// [AsmJit]
// Complete x86/x64 JIT and Remote Assembler for C++.
//
// [License]
// Zlib - See LICENSE.md file in the package.

// [Guard]
#ifndef _TEST_ASMJIT_TEST_OPCODE_H
#define _TEST_ASMJIT_TEST_OPCODE_H

// [Dependencies]
#include "../asmjit/asmjit.h"

namespace asmgen {

// Hard-coded instruction-count constant associated with the `opcode()`
// generator below.
// NOTE(review): presumably the number of instructions `opcode()` emits (e.g.
// for sizing buffers) - it is not derived from the function body, so confirm
// it is kept in sync whenever instructions are added or removed.
enum { kGenOpCodeInstCount = 2690 };

// Generate all instructions asmjit can emit.
static void opcode(asmjit::X86Assembler& a, bool useRex1 = false, bool useRex2 = false) {
  using namespace asmjit;
  using namespace asmjit::x86;

  // Prevent crash when the generated function is called to see disassembly.
  a.ret();

  // All instructions use the following registers, which can be changed to
  // check that `X86Assembler` properly encodes all possible combinations. If
  // `useRex1` is true the `A` registers use index 8, and if `useRex2` is true
  // the `B`, `C` and `D` registers use indexes 9, 10 and 11 (where encodable;
  // the high-byte, FP and MM registers are never changed).
  X86GpReg gLoA = useRex1 ? r8b : al;
  X86GpReg gLoB = useRex2 ? r9b : bl;

  X86GpReg gHiA = ah;
  X86GpReg gHiB = bh;

  X86GpReg gwA = useRex1 ? r8w : ax;
  X86GpReg gwB = useRex2 ? r9w : bx;

  X86GpReg gdA = useRex1 ? r8d : eax;
  X86GpReg gdB = useRex2 ? r9d : ebx;

  X86GpReg gzA = useRex1 ? r8  : a.zax;
  X86GpReg gzB = useRex2 ? r9  : a.zbx;
  X86GpReg gzC = useRex2 ? r10 : a.zcx;
  X86GpReg gzD = useRex2 ? r11 : a.zdx;

  X86FpReg fpA = fp0;
  X86FpReg fpB = fp7;

  X86MmReg mmA = mm0;
  X86MmReg mmB = mm1;

  X86XmmReg xmmA = useRex1 ? xmm8  : xmm0;
  X86XmmReg xmmB = useRex2 ? xmm9  : xmm1;
  X86XmmReg xmmC = useRex2 ? xmm10 : xmm2;
  X86XmmReg xmmD = useRex2 ? xmm11 : xmm3;

  X86YmmReg ymmA = useRex1 ? ymm8  : ymm0;
  X86YmmReg ymmB = useRex2 ? ymm9  : ymm1;
  X86YmmReg ymmC = useRex2 ? ymm10 : ymm2;
  X86YmmReg ymmD = useRex2 ? ymm11 : ymm3;

  X86Mem anyptr_gpA = ptr(gzA);
  X86Mem anyptr_gpB = ptr(gzB);
  X86Mem anyptr_gpC = ptr(gzC);
  X86Mem anyptr_gpD = ptr(gzD);

  X86Mem intptr_gpA = a.intptr_ptr(gzA);
  X86Mem intptr_gpB = a.intptr_ptr(gzB);

  X86Mem vmxptr_gpB = ptr(gzB, xmmB);
  X86Mem vmyptr_gpB = ptr(gzB, ymmB);

  Label L;

  // Base.
  a.adc(gLoA, 1);
  a.adc(gLoB, 1);
  a.adc(gHiA, 1);
  a.adc(gHiB, 1);
  a.adc(gwA, 1);
  a.adc(gwB, 1);
  a.adc(gdA, 1);
  a.adc(gdB, 1);
  a.adc(gzA, 1);
  a.adc(gzA, gzB);
  a.adc(gzA, intptr_gpB);
  a.adc(intptr_gpA, 1);
  a.adc(intptr_gpA, gzB);
  a.add(gLoA, 1);
  a.add(gLoB, 1);
  a.add(gHiA, 1);
  a.add(gHiB, 1);
  a.add(gwA, 1);
  a.add(gwB, 1);
  a.add(gdA, 1);
  a.add(gdB, 1);
  a.add(gzA, 1);
  a.add(gzA, gzB);
  a.add(gzA, intptr_gpB);
  a.add(intptr_gpA, 1);
  a.add(intptr_gpA, gzB);
  a.and_(gLoA, 1);
  a.and_(gLoB, 1);
  a.and_(gHiA, 1);
  a.and_(gHiB, 1);
  a.and_(gwA, 1);
  a.and_(gwB, 1);
  a.and_(gdA, 1);
  a.and_(gdB, 1);
  a.and_(gzA, 1);
  a.and_(gzA, gzB);
  a.and_(gzA, intptr_gpB);
  a.and_(intptr_gpA, 1);
  a.and_(intptr_gpA, gzB);
  a.bswap(gzA);
  a.bt(gzA, 1);
  a.bt(gzA, gzB);
  a.bt(intptr_gpA, 1);
  a.bt(intptr_gpA, gzB);
  a.btc(gzA, 1);
  a.btc(gzA, gzB);
  a.btc(intptr_gpA, 1);
  a.btc(intptr_gpA, gzB);
  a.btr(gzA, 1);
  a.btr(gzA, gzB);
  a.btr(intptr_gpA, 1);
  a.btr(intptr_gpA, gzB);
  a.bts(gzA, 1);
  a.bts(gzA, gzB);
  a.bts(intptr_gpA, 1);
  a.bts(intptr_gpA, gzB);
  a.call(gzA);
  a.call(intptr_gpA);
  a.cbw();
  a.cwde();
  a.clc();
  a.cld();
  a.cmc();
  a.cmp(gLoA, 1);
  a.cmp(gLoB, 1);
  a.cmp(gHiA, 1);
  a.cmp(gHiB, 1);
  a.cmp(gwA, 1);
  a.cmp(gwB, 1);
  a.cmp(gdA, 1);
  a.cmp(gdB, 1);
  a.cmp(gzA, 1);
  a.cmp(gzA, gzB);
  a.cmp(gzA, intptr_gpB);
  a.cmp(intptr_gpA, 1);
  a.cmp(intptr_gpA, gzB);
  a.cmpxchg(gzA, gzB);
  a.cmpxchg(intptr_gpA, gzB);
  a.cmpxchg8b(anyptr_gpA);
  a.cpuid();
  a.crc32(gzA, anyptr_gpB);
  a.dec(gzA);
  a.dec(intptr_gpA);
  a.div(gzA);
  a.div(intptr_gpA);
  a.idiv(gzA);
  a.idiv(intptr_gpA);
  a.imul(gzA);
  a.imul(intptr_gpA);
  a.imul(gzA, 1);
  a.imul(gzA, gzB);
  a.imul(gzA, gzB, 1);
  a.imul(gzA, intptr_gpB);
  a.imul(gzA, intptr_gpB, 1);
  a.inc(gzA);
  a.inc(intptr_gpA);
  a.int3();
  a.lea(gzA, intptr_gpB);
  a.mov(gzA, 1);
  a.mov(gzA, gzB);
  a.mov(gzA, intptr_gpB);
  a.mov(intptr_gpA, 1);
  a.mov(intptr_gpA, gzB);
  a.movsx(gzA, gLoB);
  a.movsx(gzA, byte_ptr(gzB));
  a.movzx(gzA, gLoB);
  a.movzx(gzA, byte_ptr(gzB));
  a.movbe(gzA, anyptr_gpB);
  a.movbe(anyptr_gpA, gzB);
  a.mul(gzA);
  a.mul(intptr_gpA);
  a.neg(gzA);
  a.neg(intptr_gpA);
  a.nop();
  a.not_(gzA);
  a.not_(intptr_gpA);
  a.or_(gLoA, 1);
  a.or_(gLoB, 1);
  a.or_(gHiA, 1);
  a.or_(gHiB, 1);
  a.or_(gwA, 1);
  a.or_(gwB, 1);
  a.or_(gdA, 1);
  a.or_(gdB, 1);
  a.or_(gzA, 1);
  a.or_(gzA, gzB);
  a.or_(gzA, intptr_gpB);
  a.or_(intptr_gpA, 1);
  a.or_(intptr_gpA, gzB);
  a.pop(gzA);
  a.pop(intptr_gpA);
  a.push(gzA);
  a.push(intptr_gpA);
  a.push(0);
  a.rcl(gzA, 0);
  a.rcl(gzA, 1);
  a.rcl(gzA, cl);
  a.rcl(intptr_gpA, 0);
  a.rcl(intptr_gpA, 1);
  a.rcl(intptr_gpA, cl);
  a.rcr(gzA, 0);
  a.rcr(gzA, 1);
  a.rcr(gzA, cl);
  a.rcr(intptr_gpA, 0);
  a.rcr(intptr_gpA, 1);
  a.rcr(intptr_gpA, cl);
  a.rdtsc();
  a.rdtscp();
  a.ret();
  a.ret(0);
  a.rol(gzA, 0);
  a.rol(gzA, 1);
  a.rol(gzA, cl);
  a.rol(intptr_gpA, 0);
  a.rol(intptr_gpA, 1);
  a.rol(intptr_gpA, cl);
  a.ror(gzA, 0);
  a.ror(gzA, 1);
  a.ror(gzA, cl);
  a.ror(intptr_gpA, 0);
  a.ror(intptr_gpA, 1);
  a.ror(intptr_gpA, cl);
  a.sbb(gLoA, 1);
  a.sbb(gLoB, 1);
  a.sbb(gHiA, 1);
  a.sbb(gHiB, 1);
  a.sbb(gwA, 1);
  a.sbb(gwB, 1);
  a.sbb(gdA, 1);
  a.sbb(gdB, 1);
  a.sbb(gzA, 1);
  a.sbb(gzA, gzB);
  a.sbb(gzA, intptr_gpB);
  a.sbb(intptr_gpA, 1);
  a.sbb(intptr_gpA, gzB);
  a.sal(gzA, cl);
  a.sal(gzA, 0);
  a.sal(gzA, 1);
  a.sal(intptr_gpA, 0);
  a.sal(intptr_gpA, 1);
  a.sal(intptr_gpA, cl);
  a.sar(gzA, 0);
  a.sar(gzA, 1);
  a.sar(gzA, cl);
  a.sar(intptr_gpA, 0);
  a.sar(intptr_gpA, 1);
  a.sar(intptr_gpA, cl);
  a.shl(gzA, 0);
  a.shl(gzA, 1);
  a.shl(gzA, cl);
  a.shl(intptr_gpA, 0);
  a.shl(intptr_gpA, 1);
  a.shl(intptr_gpA, cl);
  a.shr(gzA, 0);
  a.shr(gzA, 1);
  a.shr(gzA, cl);
  a.shr(intptr_gpA, 0);
  a.shr(intptr_gpA, 1);
  a.shr(intptr_gpA, cl);
  a.shld(gzA, gzB, 0);
  a.shld(gzA, gzB, cl);
  a.shld(intptr_gpA, gzB, 0);
  a.shld(intptr_gpA, gzB, cl);
  a.shrd(gzA, gzB, 0);
  a.shrd(gzA, gzB, cl);
  a.shrd(intptr_gpA, gzB, 0);
  a.shrd(intptr_gpA, gzB, cl);
  a.stc();
  a.std();
  a.sub(gLoA, 1);
  a.sub(gLoB, 1);
  a.sub(gHiA, 1);
  a.sub(gHiB, 1);
  a.sub(gwA, 1);
  a.sub(gwB, 1);
  a.sub(gdA, 1);
  a.sub(gdB, 1);
  a.sub(gzA, 1);
  a.sub(gzA, gzB);
  a.sub(gzA, intptr_gpB);
  a.sub(intptr_gpA, 1);
  a.sub(intptr_gpA, gzB);
  a.test(gzA, 1);
  a.test(gzA, gzB);
  a.test(intptr_gpA, 1);
  a.test(intptr_gpA, gzB);
  a.ud2();
  a.xadd(gzA, gzB);
  a.xadd(intptr_gpA, gzB);
  a.xchg(gzA, gzB);
  a.xchg(intptr_gpA, gzB);
  a.xchg(gzA, intptr_gpB);
  a.xor_(gLoA, 1);
  a.xor_(gLoB, 1);
  a.xor_(gHiA, 1);
  a.xor_(gHiB, 1);
  a.xor_(gwA, 1);
  a.xor_(gwB, 1);
  a.xor_(gdA, 1);
  a.xor_(gdB, 1);
  a.xor_(gzA, 1);
  a.xor_(gzA, gzB);
  a.xor_(gzA, intptr_gpB);
  a.xor_(intptr_gpA, 1);
  a.xor_(intptr_gpA, gzB);

  // Segment registers.
  a.nop();

  if (a.getArch() == kArchX86) {
    a.mov(es, ax);
    a.mov(es, bx);
    a.mov(ax, es);
    a.mov(bx, es);

    a.mov(cs, ax);
    a.mov(cs, bx);
    a.mov(ax, cs);
    a.mov(bx, cs);

    a.mov(ss, ax);
    a.mov(ss, bx);
    a.mov(ax, ss);
    a.mov(bx, ss);

    a.mov(ds, ax);
    a.mov(ds, bx);
    a.mov(ax, ds);
    a.mov(bx, ds);
  }

  a.mov(fs, ax);
  a.mov(fs, bx);
  a.mov(ax, fs);
  a.mov(bx, fs);

  a.mov(gs, ax);
  a.mov(gs, bx);
  a.mov(ax, gs);
  a.mov(bx, gs);

  // Instructions using REP prefix.
  a.nop();

  a.lodsb();
  a.lodsd();
  a.lodsw();
  a.rep_lodsb();
  a.rep_lodsd();
  a.rep_lodsw();

  a.movsb();
  a.movsd();
  a.movsw();
  a.rep_movsb();
  a.rep_movsd();
  a.rep_movsw();

  a.stosb();
  a.stosd();
  a.stosw();
  a.rep_stosb();
  a.rep_stosd();
  a.rep_stosw();

  a.cmpsb();
  a.cmpsd();
  a.cmpsw();
  a.repe_cmpsb();
  a.repe_cmpsd();
  a.repe_cmpsw();
  a.repne_cmpsb();
  a.repne_cmpsd();
  a.repne_cmpsw();

  a.scasb();
  a.scasd();
  a.scasw();
  a.repe_scasb();
  a.repe_scasd();
  a.repe_scasw();
  a.repne_scasb();
  a.repne_scasd();
  a.repne_scasw();

  // Label...Jcc/Jecxz/Jmp.
  a.nop();

  L = a.newLabel();
  a.bind(L);
  a.ja(L);
  a.jae(L);
  a.jb(L);
  a.jbe(L);
  a.jc(L);
  a.je(L);
  a.jg(L);
  a.jge(L);
  a.jl(L);
  a.jle(L);
  a.jna(L);
  a.jnae(L);
  a.jnb(L);
  a.jnbe(L);
  a.jnc(L);
  a.jne(L);
  a.jng(L);
  a.jnge(L);
  a.jnl(L);
  a.jnle(L);
  a.jno(L);
  a.jnp(L);
  a.jns(L);
  a.jnz(L);
  a.jo(L);
  a.jp(L);
  a.jpe(L);
  a.jpo(L);
  a.js(L);
  a.jz(L);
  a.jecxz(ecx, L);
  a.jmp(L);

  // Jcc/Jecxz/Jmp...Label.
  a.nop();

  L = a.newLabel();
  a.ja(L);
  a.jae(L);
  a.jb(L);
  a.jbe(L);
  a.jc(L);
  a.je(L);
  a.jg(L);
  a.jge(L);
  a.jl(L);
  a.jle(L);
  a.jna(L);
  a.jnae(L);
  a.jnb(L);
  a.jnbe(L);
  a.jnc(L);
  a.jne(L);
  a.jng(L);
  a.jnge(L);
  a.jnl(L);
  a.jnle(L);
  a.jno(L);
  a.jnp(L);
  a.jns(L);
  a.jnz(L);
  a.jo(L);
  a.jp(L);
  a.jpe(L);
  a.jpo(L);
  a.js(L);
  a.jz(L);
  a.jecxz(ecx, L);
  a.jmp(L);
  a.bind(L);

  // FPU.
  a.nop();

  a.f2xm1();
  a.fabs();
  a.fadd(fpA, fpB);
  a.fadd(fpB, fpA);
  a.fadd(dword_ptr(gzA));
  a.fadd(qword_ptr(gzA));
  a.faddp(fpB);
  a.faddp();
  a.fbld(dword_ptr(gzA));
  a.fbstp(dword_ptr(gzA));
  a.fchs();
  a.fclex();
  a.fcom(fpB);
  a.fcom();
  a.fcom(dword_ptr(gzA));
  a.fcom(qword_ptr(gzA));
  a.fcomp(fpB);
  a.fcomp();
  a.fcomp(dword_ptr(gzA));
  a.fcomp(qword_ptr(gzA));
  a.fcompp();
  a.fcos();
  a.fdecstp();
  a.fdiv(fpA, fpB);
  a.fdiv(fpB, fpA);
  a.fdiv(dword_ptr(gzA));
  a.fdiv(qword_ptr(gzA));
  a.fdivp(fpB);
  a.fdivp();
  a.fdivr(fpA, fpB);
  a.fdivr(fpB, fpA);
  a.fdivr(dword_ptr(gzA));
  a.fdivr(qword_ptr(gzA));
  a.fdivrp(fpB);
  a.fdivrp();
  a.fiadd(dword_ptr(gzA));
  a.ficom(word_ptr(gzA));
  a.ficom(dword_ptr(gzA));
  a.ficomp(word_ptr(gzA));
  a.ficomp(dword_ptr(gzA));
  a.fidiv(word_ptr(gzA));
  a.fidiv(dword_ptr(gzA));
  a.fidivr(word_ptr(gzA));
  a.fidivr(dword_ptr(gzA));
  a.fild(word_ptr(gzA));
  a.fild(dword_ptr(gzA));
  a.fild(qword_ptr(gzA));
  a.fimul(word_ptr(gzA));
  a.fimul(dword_ptr(gzA));
  a.fincstp();
  a.finit();
  a.fninit();
  a.fisub(word_ptr(gzA));
  a.fisub(dword_ptr(gzA));
  a.fisubr(word_ptr(gzA));
  a.fisubr(dword_ptr(gzA));
  a.fist(word_ptr(gzA));
  a.fist(dword_ptr(gzA));
  a.fistp(word_ptr(gzA));
  a.fistp(dword_ptr(gzA));
  a.fistp(qword_ptr(gzA));
  a.fld(dword_ptr(gzA));
  a.fld(qword_ptr(gzA));
  a.fld(tword_ptr(gzA));
  a.fld1();
  a.fldl2t();
  a.fldl2e();
  a.fldpi();
  a.fldlg2();
  a.fldln2();
  a.fldz();
  a.fldcw(anyptr_gpA);
  a.fldenv(anyptr_gpA);
  a.fmul(fpA, fpB);
  a.fmul(fpB, fpA);
  a.fmul(dword_ptr(gzA));
  a.fmul(qword_ptr(gzA));
  a.fmulp(fpB);
  a.fmulp();
  a.fnclex();
  a.fnop();
  a.fnsave(anyptr_gpA);
  a.fnstenv(anyptr_gpA);
  a.fnstcw(anyptr_gpA);
  a.fpatan();
  a.fprem();
  a.fprem1();
  a.fptan();
  a.frndint();
  a.frstor(anyptr_gpA);
  a.fsave(anyptr_gpA);
  a.fscale();
  a.fsin();
  a.fsincos();
  a.fsqrt();
  a.fst(dword_ptr(gzA));
  a.fst(qword_ptr(gzA));
  a.fstp(dword_ptr(gzA));
  a.fstp(qword_ptr(gzA));
  a.fstp(tword_ptr(gzA));
  a.fstcw(anyptr_gpA);
  a.fstenv(anyptr_gpA);
  a.fsub(fpA, fpB);
  a.fsub(fpB, fpA);
  a.fsub(dword_ptr(gzA));
  a.fsub(qword_ptr(gzA));
  a.fsubp(fpB);
  a.fsubp();
  a.fsubr(fpA, fpB);
  a.fsubr(fpB, fpA);
  a.fsubr(dword_ptr(gzA));
  a.fsubr(qword_ptr(gzA));
  a.fsubrp(fpB);
  a.fsubrp();
  a.ftst();
  a.fucom(fpB);
  a.fucom();
  a.fucom(fpB);
  a.fucomi(fpB);
  a.fucomip(fpB);
  a.fucomp(fpB);
  a.fucompp();
  a.fxam();
  a.fxrstor(anyptr_gpA);
  a.fxsave(anyptr_gpA);
  a.fxtract();
  a.fyl2x();
  a.fyl2xp1();

  // MMX/MMX-EXT.
  a.nop();

  a.movd(anyptr_gpA, mmB);
  a.movd(gdA, mmB);
  a.movd(mmA, anyptr_gpB);
  a.movd(mmA, gdB);
  a.movq(mmA, mmB);
  a.movq(anyptr_gpA, mmB);
  a.movq(mmA, anyptr_gpB);
  a.packuswb(mmA, mmB);
  a.packuswb(mmA, anyptr_gpB);
  a.paddb(mmA, mmB);
  a.paddb(mmA, anyptr_gpB);
  a.paddw(mmA, mmB);
  a.paddw(mmA, anyptr_gpB);
  a.paddd(mmA, mmB);
  a.paddd(mmA, anyptr_gpB);
  a.paddsb(mmA, mmB);
  a.paddsb(mmA, anyptr_gpB);
  a.paddsw(mmA, mmB);
  a.paddsw(mmA, anyptr_gpB);
  a.paddusb(mmA, mmB);
  a.paddusb(mmA, anyptr_gpB);
  a.paddusw(mmA, mmB);
  a.paddusw(mmA, anyptr_gpB);
  a.pand(mmA, mmB);
  a.pand(mmA, anyptr_gpB);
  a.pandn(mmA, mmB);
  a.pandn(mmA, anyptr_gpB);
  a.pcmpeqb(mmA, mmB);
  a.pcmpeqb(mmA, anyptr_gpB);
  a.pcmpeqw(mmA, mmB);
  a.pcmpeqw(mmA, anyptr_gpB);
  a.pcmpeqd(mmA, mmB);
  a.pcmpeqd(mmA, anyptr_gpB);
  a.pcmpgtb(mmA, mmB);
  a.pcmpgtb(mmA, anyptr_gpB);
  a.pcmpgtw(mmA, mmB);
  a.pcmpgtw(mmA, anyptr_gpB);
  a.pcmpgtd(mmA, mmB);
  a.pcmpgtd(mmA, anyptr_gpB);
  a.pmulhw(mmA, mmB);
  a.pmulhw(mmA, anyptr_gpB);
  a.pmullw(mmA, mmB);
  a.pmullw(mmA, anyptr_gpB);
  a.por(mmA, mmB);
  a.por(mmA, anyptr_gpB);
  a.pmaddwd(mmA, mmB);
  a.pmaddwd(mmA, anyptr_gpB);
  a.pslld(mmA, mmB);
  a.pslld(mmA, anyptr_gpB);
  a.pslld(mmA, 0);
  a.psllq(mmA, mmB);
  a.psllq(mmA, anyptr_gpB);
  a.psllq(mmA, 0);
  a.psllw(mmA, mmB);
  a.psllw(mmA, anyptr_gpB);
  a.psllw(mmA, 0);
  a.psrad(mmA, mmB);
  a.psrad(mmA, anyptr_gpB);
  a.psrad(mmA, 0);
  a.psraw(mmA, mmB);
  a.psraw(mmA, anyptr_gpB);
  a.psraw(mmA, 0);
  a.psrld(mmA, mmB);
  a.psrld(mmA, anyptr_gpB);
  a.psrld(mmA, 0);
  a.psrlq(mmA, mmB);
  a.psrlq(mmA, anyptr_gpB);
  a.psrlq(mmA, 0);
  a.psrlw(mmA, mmB);
  a.psrlw(mmA, anyptr_gpB);
  a.psrlw(mmA, 0);
  a.psubb(mmA, mmB);
  a.psubb(mmA, anyptr_gpB);
  a.psubw(mmA, mmB);
  a.psubw(mmA, anyptr_gpB);
  a.psubd(mmA, mmB);
  a.psubd(mmA, anyptr_gpB);
  a.psubsb(mmA, mmB);
  a.psubsb(mmA, anyptr_gpB);
  a.psubsw(mmA, mmB);
  a.psubsw(mmA, anyptr_gpB);
  a.psubusb(mmA, mmB);
  a.psubusb(mmA, anyptr_gpB);
  a.psubusw(mmA, mmB);
  a.psubusw(mmA, anyptr_gpB);
  a.punpckhbw(mmA, mmB);
  a.punpckhbw(mmA, anyptr_gpB);
  a.punpckhwd(mmA, mmB);
  a.punpckhwd(mmA, anyptr_gpB);
  a.punpckhdq(mmA, mmB);
  a.punpckhdq(mmA, anyptr_gpB);
  a.punpcklbw(mmA, mmB);
  a.punpcklbw(mmA, anyptr_gpB);
  a.punpcklwd(mmA, mmB);
  a.punpcklwd(mmA, anyptr_gpB);
  a.punpckldq(mmA, mmB);
  a.punpckldq(mmA, anyptr_gpB);
  a.pxor(mmA, mmB);
  a.pxor(mmA, anyptr_gpB);
  a.emms();

  // 3DNOW!
  a.nop();

  a.pf2id(mmA, mmB);
  a.pf2id(mmA, anyptr_gpB);
  a.pf2iw(mmA, mmB);
  a.pf2iw(mmA, anyptr_gpB);
  a.pfacc(mmA, mmB);
  a.pfacc(mmA, anyptr_gpB);
  a.pfadd(mmA, mmB);
  a.pfadd(mmA, anyptr_gpB);
  a.pfcmpeq(mmA, mmB);
  a.pfcmpeq(mmA, anyptr_gpB);
  a.pfcmpge(mmA, mmB);
  a.pfcmpge(mmA, anyptr_gpB);
  a.pfcmpgt(mmA, mmB);
  a.pfcmpgt(mmA, anyptr_gpB);
  a.pfmax(mmA, mmB);
  a.pfmax(mmA, anyptr_gpB);
  a.pfmin(mmA, mmB);
  a.pfmin(mmA, anyptr_gpB);
  a.pfmul(mmA, mmB);
  a.pfmul(mmA, anyptr_gpB);
  a.pfnacc(mmA, mmB);
  a.pfnacc(mmA, anyptr_gpB);
  a.pfpnacc(mmA, mmB);
  a.pfpnacc(mmA, anyptr_gpB);
  a.pfrcp(mmA, mmB);
  a.pfrcp(mmA, anyptr_gpB);
  a.pfrcpit1(mmA, mmB);
  a.pfrcpit1(mmA, anyptr_gpB);
  a.pfrcpit2(mmA, mmB);
  a.pfrcpit2(mmA, anyptr_gpB);
  a.pfrsqit1(mmA, mmB);
  a.pfrsqit1(mmA, anyptr_gpB);
  a.pfrsqrt(mmA, mmB);
  a.pfrsqrt(mmA, anyptr_gpB);
  a.pfsub(mmA, mmB);
  a.pfsub(mmA, anyptr_gpB);
  a.pfsubr(mmA, mmB);
  a.pfsubr(mmA, anyptr_gpB);
  a.pi2fd(mmA, mmB);
  a.pi2fd(mmA, anyptr_gpB);
  a.pi2fw(mmA, mmB);
  a.pi2fw(mmA, anyptr_gpB);
  a.pswapd(mmA, mmB);
  a.pswapd(mmA, anyptr_gpB);
  a.prefetch3dnow(anyptr_gpA);
  a.prefetchw3dnow(anyptr_gpA);
  a.femms();

  // SSE.
  a.nop();

  a.addps(xmmA, xmmB);
  a.addps(xmmA, anyptr_gpB);
  a.addss(xmmA, xmmB);
  a.addss(xmmA, anyptr_gpB);
  a.andnps(xmmA, xmmB);
  a.andnps(xmmA, anyptr_gpB);
  a.andps(xmmA, xmmB);
  a.andps(xmmA, anyptr_gpB);
  a.cmpps(xmmA, xmmB, 0);
  a.cmpps(xmmA, anyptr_gpB, 0);
  a.cmpss(xmmA, xmmB, 0);
  a.cmpss(xmmA, anyptr_gpB, 0);
  a.comiss(xmmA, xmmB);
  a.comiss(xmmA, anyptr_gpB);
  a.cvtpi2ps(xmmA, mmB);
  a.cvtpi2ps(xmmA, anyptr_gpB);
  a.cvtps2pi(mmA, xmmB);
  a.cvtps2pi(mmA, anyptr_gpB);
  a.cvtsi2ss(xmmA, gzB);
  a.cvtsi2ss(xmmA, anyptr_gpB);
  a.cvtss2si(gzA, xmmB);
  a.cvtss2si(gzA, anyptr_gpB);
  a.cvttps2pi(mmA, xmmB);
  a.cvttps2pi(mmA, anyptr_gpB);
  a.cvttss2si(gzA, xmmB);
  a.cvttss2si(gzA, anyptr_gpB);
  a.divps(xmmA, xmmB);
  a.divps(xmmA, anyptr_gpB);
  a.divss(xmmA, xmmB);
  a.divss(xmmA, anyptr_gpB);
  a.ldmxcsr(anyptr_gpA);
  a.maskmovq(mmA, mmB);
  a.maxps(xmmA, xmmB);
  a.maxps(xmmA, anyptr_gpB);
  a.maxss(xmmA, xmmB);
  a.maxss(xmmA, anyptr_gpB);
  a.minps(xmmA, xmmB);
  a.minps(xmmA, anyptr_gpB);
  a.minss(xmmA, xmmB);
  a.minss(xmmA, anyptr_gpB);
  a.movaps(xmmA, xmmB);
  a.movaps(xmmA, anyptr_gpB);
  a.movaps(anyptr_gpA, xmmB);
  a.movd(anyptr_gpA, xmmB);
  a.movd(gdA, xmmB);
  a.movd(xmmA, anyptr_gpB);
  a.movd(xmmA, gdB);
  a.movq(mmA, mmB);
  a.movq(xmmA, xmmB);
  a.movq(anyptr_gpA, xmmB);
  a.movq(xmmA, anyptr_gpB);
  a.movntq(anyptr_gpA, mmB);
  a.movhlps(xmmA, xmmB);
  a.movhps(xmmA, anyptr_gpB);
  a.movhps(anyptr_gpA, xmmB);
  a.movlhps(xmmA, xmmB);
  a.movlps(xmmA, anyptr_gpB);
  a.movlps(anyptr_gpA, xmmB);
  a.movntps(anyptr_gpA, xmmB);
  a.movss(xmmA, anyptr_gpB);
  a.movss(anyptr_gpA, xmmB);
  a.movups(xmmA, xmmB);
  a.movups(xmmA, anyptr_gpB);
  a.movups(anyptr_gpA, xmmB);
  a.mulps(xmmA, xmmB);
  a.mulps(xmmA, anyptr_gpB);
  a.mulss(xmmA, xmmB);
  a.mulss(xmmA, anyptr_gpB);
  a.orps(xmmA, xmmB);
  a.orps(xmmA, anyptr_gpB);
  a.pavgb(mmA, mmB);
  a.pavgb(mmA, anyptr_gpB);
  a.pavgw(mmA, mmB);
  a.pavgw(mmA, anyptr_gpB);
  a.pextrw(gzA, mmB, 0);
  a.pinsrw(mmA, gdB, 0);
  a.pinsrw(mmA, anyptr_gpB, 0);
  a.pmaxsw(mmA, mmB);
  a.pmaxsw(mmA, anyptr_gpB);
  a.pmaxub(mmA, mmB);
  a.pmaxub(mmA, anyptr_gpB);
  a.pminsw(mmA, mmB);
  a.pminsw(mmA, anyptr_gpB);
  a.pminub(mmA, mmB);
  a.pminub(mmA, anyptr_gpB);
  a.pmovmskb(gzA, mmB);
  a.pmulhuw(mmA, mmB);
  a.pmulhuw(mmA, anyptr_gpB);
  a.psadbw(mmA, mmB);
  a.psadbw(mmA, anyptr_gpB);
  a.pshufw(mmA, mmB, 0);
  a.pshufw(mmA, anyptr_gpB, 0);
  a.rcpps(xmmA, xmmB);
  a.rcpps(xmmA, anyptr_gpB);
  a.rcpss(xmmA, xmmB);
  a.rcpss(xmmA, anyptr_gpB);
  a.prefetch(anyptr_gpA, 0);
  a.psadbw(xmmA, xmmB);
  a.psadbw(xmmA, anyptr_gpB);
  a.rsqrtps(xmmA, xmmB);
  a.rsqrtps(xmmA, anyptr_gpB);
  a.rsqrtss(xmmA, xmmB);
  a.rsqrtss(xmmA, anyptr_gpB);
  a.sfence();
  a.shufps(xmmA, xmmB, 0);
  a.shufps(xmmA, anyptr_gpB, 0);
  a.sqrtps(xmmA, xmmB);
  a.sqrtps(xmmA, anyptr_gpB);
  a.sqrtss(xmmA, xmmB);
  a.sqrtss(xmmA, anyptr_gpB);
  a.stmxcsr(anyptr_gpA);
  a.subps(xmmA, xmmB);
  a.subps(xmmA, anyptr_gpB);
  a.subss(xmmA, xmmB);
  a.subss(xmmA, anyptr_gpB);
  a.ucomiss(xmmA, xmmB);
  a.ucomiss(xmmA, anyptr_gpB);
  a.unpckhps(xmmA, xmmB);
  a.unpckhps(xmmA, anyptr_gpB);
  a.unpcklps(xmmA, xmmB);
  a.unpcklps(xmmA, anyptr_gpB);
  a.xorps(xmmA, xmmB);
  a.xorps(xmmA, anyptr_gpB);

  // SSE2.
  a.nop();

  a.addpd(xmmA, xmmB);
  a.addpd(xmmA, anyptr_gpB);
  a.addsd(xmmA, xmmB);
  a.addsd(xmmA, anyptr_gpB);
  a.andnpd(xmmA, xmmB);
  a.andnpd(xmmA, anyptr_gpB);
  a.andpd(xmmA, xmmB);
  a.andpd(xmmA, anyptr_gpB);
  a.clflush(anyptr_gpA);
  a.cmppd(xmmA, xmmB, 0);
  a.cmppd(xmmA, anyptr_gpB, 0);
  a.cmpsd(xmmA, xmmB, 0);
  a.cmpsd(xmmA, anyptr_gpB, 0);
  a.comisd(xmmA, xmmB);
  a.comisd(xmmA, anyptr_gpB);
  a.cvtdq2pd(xmmA, xmmB);
  a.cvtdq2pd(xmmA, anyptr_gpB);
  a.cvtdq2ps(xmmA, xmmB);
  a.cvtdq2ps(xmmA, anyptr_gpB);
  a.cvtpd2dq(xmmA, xmmB);
  a.cvtpd2dq(xmmA, anyptr_gpB);
  a.cvtpd2pi(mmA, xmmB);
  a.cvtpd2pi(mmA, anyptr_gpB);
  a.cvtpd2ps(xmmA, xmmB);
  a.cvtpd2ps(xmmA, anyptr_gpB);
  a.cvtpi2pd(xmmA, mmB);
  a.cvtpi2pd(xmmA, anyptr_gpB);
  a.cvtps2dq(xmmA, xmmB);
  a.cvtps2dq(xmmA, anyptr_gpB);
  a.cvtps2pd(xmmA, xmmB);
  a.cvtps2pd(xmmA, anyptr_gpB);
  a.cvtsd2si(gzA, xmmB);
  a.cvtsd2si(gzA, anyptr_gpB);
  a.cvtsd2ss(xmmA, xmmB);
  a.cvtsd2ss(xmmA, anyptr_gpB);
  a.cvtsi2sd(xmmA, gzB);
  a.cvtsi2sd(xmmA, anyptr_gpB);
  a.cvtss2sd(xmmA, xmmB);
  a.cvtss2sd(xmmA, anyptr_gpB);
  a.cvtss2si(gzA, xmmB);
  a.cvtss2si(gzA, anyptr_gpB);
  a.cvttpd2pi(mmA, xmmB);
  a.cvttpd2pi(mmA, anyptr_gpB);
  a.cvttpd2dq(xmmA, xmmB);
  a.cvttpd2dq(xmmA, anyptr_gpB);
  a.cvttps2dq(xmmA, xmmB);
  a.cvttps2dq(xmmA, anyptr_gpB);
  a.cvttsd2si(gzA, xmmB);
  a.cvttsd2si(gzA, anyptr_gpB);
  a.divpd(xmmA, xmmB);
  a.divpd(xmmA, anyptr_gpB);
  a.divsd(xmmA, xmmB);
  a.divsd(xmmA, anyptr_gpB);
  a.lfence();
  a.maskmovdqu(xmmA, xmmB);
  a.maxpd(xmmA, xmmB);
  a.maxpd(xmmA, anyptr_gpB);
  a.maxsd(xmmA, xmmB);
  a.maxsd(xmmA, anyptr_gpB);
  a.mfence();
  a.minpd(xmmA, xmmB);
  a.minpd(xmmA, anyptr_gpB);
  a.minsd(xmmA, xmmB);
  a.minsd(xmmA, anyptr_gpB);
  a.movdqa(xmmA, xmmB);
  a.movdqa(xmmA, anyptr_gpB);
  a.movdqa(anyptr_gpA, xmmB);
  a.movdqu(xmmA, xmmB);
  a.movdqu(xmmA, anyptr_gpB);
  a.movdqu(anyptr_gpA, xmmB);
  a.movmskps(gzA, xmmB);
  a.movmskpd(gzA, xmmB);
  a.movsd(xmmA, xmmB);
  a.movsd(xmmA, anyptr_gpB);
  a.movsd(anyptr_gpA, xmmB);
  a.movapd(xmmA, anyptr_gpB);
  a.movapd(anyptr_gpA, xmmB);
  a.movdq2q(mmA, xmmB);
  a.movq2dq(xmmA, mmB);
  a.movhpd(xmmA, anyptr_gpB);
  a.movhpd(anyptr_gpA, xmmB);
  a.movlpd(xmmA, anyptr_gpB);
  a.movlpd(anyptr_gpA, xmmB);
  a.movntdq(anyptr_gpA, xmmB);
  a.movnti(anyptr_gpA, gzB);
  a.movntpd(anyptr_gpA, xmmB);
  a.movupd(xmmA, anyptr_gpB);
  a.movupd(anyptr_gpA, xmmB);
  a.mulpd(xmmA, xmmB);
  a.mulpd(xmmA, anyptr_gpB);
  a.mulsd(xmmA, xmmB);
  a.mulsd(xmmA, anyptr_gpB);
  a.orpd(xmmA, xmmB);
  a.orpd(xmmA, anyptr_gpB);
  a.packsswb(xmmA, xmmB);
  a.packsswb(xmmA, anyptr_gpB);
  a.packssdw(xmmA, xmmB);
  a.packssdw(xmmA, anyptr_gpB);
  a.packuswb(xmmA, xmmB);
  a.packuswb(xmmA, anyptr_gpB);
  a.paddb(xmmA, xmmB);
  a.paddb(xmmA, anyptr_gpB);
  a.paddw(xmmA, xmmB);
  a.paddw(xmmA, anyptr_gpB);
  a.paddd(xmmA, xmmB);
  a.paddd(xmmA, anyptr_gpB);
  a.paddq(mmA, mmB);
  a.paddq(mmA, anyptr_gpB);
  a.paddq(xmmA, xmmB);
  a.paddq(xmmA, anyptr_gpB);
  a.paddsb(xmmA, xmmB);
  a.paddsb(xmmA, anyptr_gpB);
  a.paddsw(xmmA, xmmB);
  a.paddsw(xmmA, anyptr_gpB);
  a.paddusb(xmmA, xmmB);
  a.paddusb(xmmA, anyptr_gpB);
  a.paddusw(xmmA, xmmB);
  a.paddusw(xmmA, anyptr_gpB);
  a.pand(xmmA, xmmB);
  a.pand(xmmA, anyptr_gpB);
  a.pandn(xmmA, xmmB);
  a.pandn(xmmA, anyptr_gpB);
  a.pause();
  a.pavgb(xmmA, xmmB);
  a.pavgb(xmmA, anyptr_gpB);
  a.pavgw(xmmA, xmmB);
  a.pavgw(xmmA, anyptr_gpB);
  a.pcmpeqb(xmmA, xmmB);
  a.pcmpeqb(xmmA, anyptr_gpB);
  a.pcmpeqw(xmmA, xmmB);
  a.pcmpeqw(xmmA, anyptr_gpB);
  a.pcmpeqd(xmmA, xmmB);
  a.pcmpeqd(xmmA, anyptr_gpB);
  a.pcmpgtb(xmmA, xmmB);
  a.pcmpgtb(xmmA, anyptr_gpB);
  a.pcmpgtw(xmmA, xmmB);
  a.pcmpgtw(xmmA, anyptr_gpB);
  a.pcmpgtd(xmmA, xmmB);
  a.pcmpgtd(xmmA, anyptr_gpB);
  a.pmaxsw(xmmA, xmmB);
  a.pmaxsw(xmmA, anyptr_gpB);
  a.pmaxub(xmmA, xmmB);
  a.pmaxub(xmmA, anyptr_gpB);
  a.pminsw(xmmA, xmmB);
  a.pminsw(xmmA, anyptr_gpB);
  a.pminub(xmmA, xmmB);
  a.pminub(xmmA, anyptr_gpB);
  a.pmovmskb(gzA, xmmB);
  a.pmulhw(xmmA, xmmB);
  a.pmulhw(xmmA, anyptr_gpB);
  a.pmulhuw(xmmA, xmmB);
  a.pmulhuw(xmmA, anyptr_gpB);
  a.pmullw(xmmA, xmmB);
  a.pmullw(xmmA, anyptr_gpB);
  a.pmuludq(mmA, mmB);
  a.pmuludq(mmA, anyptr_gpB);
  a.pmuludq(xmmA, xmmB);
  a.pmuludq(xmmA, anyptr_gpB);
  a.por(xmmA, xmmB);
  a.por(xmmA, anyptr_gpB);
  a.pslld(xmmA, xmmB);
  a.pslld(xmmA, anyptr_gpB);
  a.pslld(xmmA, 0);
  a.psllq(xmmA, xmmB);
  a.psllq(xmmA, anyptr_gpB);
  a.psllq(xmmA, 0);
  a.psllw(xmmA, xmmB);
  a.psllw(xmmA, anyptr_gpB);
  a.psllw(xmmA, 0);
  a.pslldq(xmmA, 0);
  a.psrad(xmmA, xmmB);
  a.psrad(xmmA, anyptr_gpB);
  a.psrad(xmmA, 0);
  a.psraw(xmmA, xmmB);
  a.psraw(xmmA, anyptr_gpB);
  a.psraw(xmmA, 0);
  a.psubb(xmmA, xmmB);
  a.psubb(xmmA, anyptr_gpB);
  a.psubw(xmmA, xmmB);
  a.psubw(xmmA, anyptr_gpB);
  a.psubd(xmmA, xmmB);
  a.psubd(xmmA, anyptr_gpB);
  a.psubq(mmA, mmB);
  a.psubq(mmA, anyptr_gpB);
  a.psubq(xmmA, xmmB);
  a.psubq(xmmA, anyptr_gpB);
  a.pmaddwd(xmmA, xmmB);
  a.pmaddwd(xmmA, anyptr_gpB);
  a.pshufd(xmmA, xmmB, 0);
  a.pshufd(xmmA, anyptr_gpB, 0);
  a.pshufhw(xmmA, xmmB, 0);
  a.pshufhw(xmmA, anyptr_gpB, 0);
  a.pshuflw(xmmA, xmmB, 0);
  a.pshuflw(xmmA, anyptr_gpB, 0);
  a.psrld(xmmA, xmmB);
  a.psrld(xmmA, anyptr_gpB);
  a.psrld(xmmA, 0);
  a.psrlq(xmmA, xmmB);
  a.psrlq(xmmA, anyptr_gpB);
  a.psrlq(xmmA, 0);
  a.psrldq(xmmA, 0);
  a.psrlw(xmmA, xmmB);
  a.psrlw(xmmA, anyptr_gpB);
  a.psrlw(xmmA, 0);
  a.psubsb(xmmA, xmmB);
  a.psubsb(xmmA, anyptr_gpB);
  a.psubsw(xmmA, xmmB);
  a.psubsw(xmmA, anyptr_gpB);
  a.psubusb(xmmA, xmmB);
  a.psubusb(xmmA, anyptr_gpB);
  a.psubusw(xmmA, xmmB);
  a.psubusw(xmmA, anyptr_gpB);
  a.punpckhbw(xmmA, xmmB);
  a.punpckhbw(xmmA, anyptr_gpB);
  a.punpckhwd(xmmA, xmmB);
  a.punpckhwd(xmmA, anyptr_gpB);
  a.punpckhdq(xmmA, xmmB);
  a.punpckhdq(xmmA, anyptr_gpB);
  a.punpckhqdq(xmmA, xmmB);
  a.punpckhqdq(xmmA, anyptr_gpB);
  a.punpcklbw(xmmA, xmmB);
  a.punpcklbw(xmmA, anyptr_gpB);
  a.punpcklwd(xmmA, xmmB);
  a.punpcklwd(xmmA, anyptr_gpB);
  a.punpckldq(xmmA, xmmB);
  a.punpckldq(xmmA, anyptr_gpB);
  a.punpcklqdq(xmmA, xmmB);
  a.punpcklqdq(xmmA, anyptr_gpB);
  a.pxor(xmmA, xmmB);
  a.pxor(xmmA, anyptr_gpB);
  a.sqrtpd(xmmA, xmmB);
  a.sqrtpd(xmmA, anyptr_gpB);
  a.sqrtsd(xmmA, xmmB);
  a.sqrtsd(xmmA, anyptr_gpB);
  a.subpd(xmmA, xmmB);
  a.subpd(xmmA, anyptr_gpB);
  a.subsd(xmmA, xmmB);
  a.subsd(xmmA, anyptr_gpB);
  a.ucomisd(xmmA, xmmB);
  a.ucomisd(xmmA, anyptr_gpB);
  a.unpckhpd(xmmA, xmmB);
  a.unpckhpd(xmmA, anyptr_gpB);
  a.unpcklpd(xmmA, xmmB);
  a.unpcklpd(xmmA, anyptr_gpB);
  a.xorpd(xmmA, xmmB);
  a.xorpd(xmmA, anyptr_gpB);

  // SSE3.
  a.nop();

  a.addsubpd(xmmA, xmmB);
  a.addsubpd(xmmA, anyptr_gpB);
  a.addsubps(xmmA, xmmB);
  a.addsubps(xmmA, anyptr_gpB);
  a.fisttp(dword_ptr(gzA));
  a.haddpd(xmmA, xmmB);
  a.haddpd(xmmA, anyptr_gpB);
  a.haddps(xmmA, xmmB);
  a.haddps(xmmA, anyptr_gpB);
  a.hsubpd(xmmA, xmmB);
  a.hsubpd(xmmA, anyptr_gpB);
  a.hsubps(xmmA, xmmB);
  a.hsubps(xmmA, anyptr_gpB);
  a.lddqu(xmmA, anyptr_gpB);
  a.monitor();
  a.movddup(xmmA, xmmB);
  a.movddup(xmmA, anyptr_gpB);
  a.movshdup(xmmA, xmmB);
  a.movshdup(xmmA, anyptr_gpB);
  a.movsldup(xmmA, xmmB);
  a.movsldup(xmmA, anyptr_gpB);
  a.mwait();

  // SSSE3.
  a.nop();

  a.psignb(mmA, mmB);
  a.psignb(mmA, anyptr_gpB);
  a.psignb(xmmA, xmmB);
  a.psignb(xmmA, anyptr_gpB);
  a.psignw(mmA, mmB);
  a.psignw(mmA, anyptr_gpB);
  a.psignw(xmmA, xmmB);
  a.psignw(xmmA, anyptr_gpB);
  a.psignd(mmA, mmB);
  a.psignd(mmA, anyptr_gpB);
  a.psignd(xmmA, xmmB);
  a.psignd(xmmA, anyptr_gpB);
  a.phaddw(mmA, mmB);
  a.phaddw(mmA, anyptr_gpB);
  a.phaddw(xmmA, xmmB);
  a.phaddw(xmmA, anyptr_gpB);
  a.phaddd(mmA, mmB);
  a.phaddd(mmA, anyptr_gpB);
  a.phaddd(xmmA, xmmB);
  a.phaddd(xmmA, anyptr_gpB);
  a.phaddsw(mmA, mmB);
  a.phaddsw(mmA, anyptr_gpB);
  a.phaddsw(xmmA, xmmB);
  a.phaddsw(xmmA, anyptr_gpB);
  a.phsubw(mmA, mmB);
  a.phsubw(mmA, anyptr_gpB);
  a.phsubw(xmmA, xmmB);
  a.phsubw(xmmA, anyptr_gpB);
  a.phsubd(mmA, mmB);
  a.phsubd(mmA, anyptr_gpB);
  a.phsubd(xmmA, xmmB);
  a.phsubd(xmmA, anyptr_gpB);
  a.phsubsw(mmA, mmB);
  a.phsubsw(mmA, anyptr_gpB);
  a.phsubsw(xmmA, xmmB);
  a.phsubsw(xmmA, anyptr_gpB);
  a.pmaddubsw(mmA, mmB);
  a.pmaddubsw(mmA, anyptr_gpB);
  a.pmaddubsw(xmmA, xmmB);
  a.pmaddubsw(xmmA, anyptr_gpB);
  a.pabsb(mmA, mmB);
  a.pabsb(mmA, anyptr_gpB);
  a.pabsb(xmmA, xmmB);
  a.pabsb(xmmA, anyptr_gpB);
  a.pabsw(mmA, mmB);
  a.pabsw(mmA, anyptr_gpB);
  a.pabsw(xmmA, xmmB);
  a.pabsw(xmmA, anyptr_gpB);
  a.pabsd(mmA, mmB);
  a.pabsd(mmA, anyptr_gpB);
  a.pabsd(xmmA, xmmB);
  a.pabsd(xmmA, anyptr_gpB);
  a.pmulhrsw(mmA, mmB);
  a.pmulhrsw(mmA, anyptr_gpB);
  a.pmulhrsw(xmmA, xmmB);
  a.pmulhrsw(xmmA, anyptr_gpB);
  a.pshufb(mmA, mmB);
  a.pshufb(mmA, anyptr_gpB);
  a.pshufb(xmmA, xmmB);
  a.pshufb(xmmA, anyptr_gpB);
  a.palignr(mmA, mmB, 0);
  a.palignr(mmA, anyptr_gpB, 0);
  a.palignr(xmmA, xmmB, 0);
  a.palignr(xmmA, anyptr_gpB, 0);

  // SSE4.1.
  a.nop();

  a.blendpd(xmmA, xmmB, 0);
  a.blendpd(xmmA, anyptr_gpB, 0);
  a.blendps(xmmA, xmmB, 0);
  a.blendps(xmmA, anyptr_gpB, 0);
  a.blendvpd(xmmA, xmmB);
  a.blendvpd(xmmA, anyptr_gpB);
  a.blendvps(xmmA, xmmB);
  a.blendvps(xmmA, anyptr_gpB);
  a.dppd(xmmA, xmmB, 0);
  a.dppd(xmmA, anyptr_gpB, 0);
  a.dpps(xmmA, xmmB, 0);
  a.dpps(xmmA, anyptr_gpB, 0);
  a.extractps(gzA, xmmB, 0);
  a.extractps(anyptr_gpA, xmmB, 0);
  a.insertps(xmmA, xmmB, 0);
  a.insertps(xmmA, anyptr_gpB, 0);
  a.movntdqa(xmmA, anyptr_gpB);
  a.mpsadbw(xmmA, xmmB, 0);
  a.mpsadbw(xmmA, anyptr_gpB, 0);
  a.packusdw(xmmA, xmmB);
  a.packusdw(xmmA, anyptr_gpB);
  a.pblendvb(xmmA, xmmB);
  a.pblendvb(xmmA, anyptr_gpB);
  a.pblendw(xmmA, xmmB, 0);
  a.pblendw(xmmA, anyptr_gpB, 0);
  a.pcmpeqq(xmmA, xmmB);
  a.pcmpeqq(xmmA, anyptr_gpB);
  a.pextrb(gzA, xmmB, 0);
  a.pextrb(anyptr_gpA, xmmB, 0);
  a.pextrd(gzA, xmmB, 0);
  a.pextrd(anyptr_gpA, xmmB, 0);
  a.pextrq(gzA, xmmB, 0);
  a.pextrq(anyptr_gpA, xmmB, 0);
  a.pextrw(gzA, xmmB, 0);
  a.pextrw(anyptr_gpA, xmmB, 0);
  a.phminposuw(xmmA, xmmB);
  a.phminposuw(xmmA, anyptr_gpB);
  a.pinsrb(xmmA, gdB, 0);
  a.pinsrb(xmmA, anyptr_gpB, 0);
  a.pinsrd(xmmA, gdB, 0);
  a.pinsrd(xmmA, anyptr_gpB, 0);
  a.pinsrw(xmmA, gdB, 0);
  a.pinsrw(xmmA, anyptr_gpB, 0);
  a.pmaxuw(xmmA, xmmB);
  a.pmaxuw(xmmA, anyptr_gpB);
  a.pmaxsb(xmmA, xmmB);
  a.pmaxsb(xmmA, anyptr_gpB);
  a.pmaxsd(xmmA, xmmB);
  a.pmaxsd(xmmA, anyptr_gpB);
  a.pmaxud(xmmA, xmmB);
  a.pmaxud(xmmA, anyptr_gpB);
  a.pminsb(xmmA, xmmB);
  a.pminsb(xmmA, anyptr_gpB);
  a.pminuw(xmmA, xmmB);
  a.pminuw(xmmA, anyptr_gpB);
  a.pminud(xmmA, xmmB);
  a.pminud(xmmA, anyptr_gpB);
  a.pminsd(xmmA, xmmB);
  a.pminsd(xmmA, anyptr_gpB);
  a.pmovsxbw(xmmA, xmmB);
  a.pmovsxbw(xmmA, anyptr_gpB);
  a.pmovsxbd(xmmA, xmmB);
  a.pmovsxbd(xmmA, anyptr_gpB);
  a.pmovsxbq(xmmA, xmmB);
  a.pmovsxbq(xmmA, anyptr_gpB);
  a.pmovsxwd(xmmA, xmmB);
  a.pmovsxwd(xmmA, anyptr_gpB);
  a.pmovsxwq(xmmA, xmmB);
  a.pmovsxwq(xmmA, anyptr_gpB);
  a.pmovsxdq(xmmA, xmmB);
  a.pmovsxdq(xmmA, anyptr_gpB);
  a.pmovzxbw(xmmA, xmmB);
  a.pmovzxbw(xmmA, anyptr_gpB);
  a.pmovzxbd(xmmA, xmmB);
  a.pmovzxbd(xmmA, anyptr_gpB);
  a.pmovzxbq(xmmA, xmmB);
  a.pmovzxbq(xmmA, anyptr_gpB);
  a.pmovzxwd(xmmA, xmmB);
  a.pmovzxwd(xmmA, anyptr_gpB);
  a.pmovzxwq(xmmA, xmmB);
  a.pmovzxwq(xmmA, anyptr_gpB);
  a.pmovzxdq(xmmA, xmmB);
  a.pmovzxdq(xmmA, anyptr_gpB);
  a.pmuldq(xmmA, xmmB);
  a.pmuldq(xmmA, anyptr_gpB);
  a.pmulld(xmmA, xmmB);
  a.pmulld(xmmA, anyptr_gpB);
  a.ptest(xmmA, xmmB);
  a.ptest(xmmA, anyptr_gpB);
  a.roundps(xmmA, xmmB, 0);
  a.roundps(xmmA, anyptr_gpB, 0);
  a.roundss(xmmA, xmmB, 0);
  a.roundss(xmmA, anyptr_gpB, 0);
  a.roundpd(xmmA, xmmB, 0);
  a.roundpd(xmmA, anyptr_gpB, 0);
  a.roundsd(xmmA, xmmB, 0);
  a.roundsd(xmmA, anyptr_gpB, 0);

  // SSE4.2.
  a.nop();

  a.pcmpestri(xmmA, xmmB, 0);
  a.pcmpestri(xmmA, anyptr_gpB, 0);
  a.pcmpestrm(xmmA, xmmB, 0);
  a.pcmpestrm(xmmA, anyptr_gpB, 0);
  a.pcmpistri(xmmA, xmmB, 0);
  a.pcmpistri(xmmA, anyptr_gpB, 0);
  a.pcmpistrm(xmmA, xmmB, 0);
  a.pcmpistrm(xmmA, anyptr_gpB, 0);
  a.pcmpgtq(xmmA, xmmB);
  a.pcmpgtq(xmmA, anyptr_gpB);

  // SSE4a.
  a.nop();

  a.extrq(xmmA, xmmB);
  a.extrq(xmmA, 0x1, 0x2);
  a.extrq(xmmB, 0x1, 0x2);
  a.insertq(xmmA, xmmB);
  a.insertq(xmmA, xmmB, 0x1, 0x2);
  a.movntsd(anyptr_gpA, xmmB);
  a.movntss(anyptr_gpA, xmmB);

  // POPCNT.
  a.nop();

  a.popcnt(gzA, gzB);
  a.popcnt(gzA, anyptr_gpB);

  // AESNI.
  a.nop();

  a.aesdec(xmmA, xmmB);
  a.aesdec(xmmA, anyptr_gpB);
  a.aesdeclast(xmmA, xmmB);
  a.aesdeclast(xmmA, anyptr_gpB);
  a.aesenc(xmmA, xmmB);
  a.aesenc(xmmA, anyptr_gpB);
  a.aesenclast(xmmA, xmmB);
  a.aesenclast(xmmA, anyptr_gpB);
  a.aesimc(xmmA, xmmB);
  a.aesimc(xmmA, anyptr_gpB);
  a.aeskeygenassist(xmmA, xmmB, 0);
  a.aeskeygenassist(xmmA, anyptr_gpB, 0);

  // PCLMULQDQ.
  a.nop();

  a.pclmulqdq(xmmA, xmmB, 0);
  a.pclmulqdq(xmmA, anyptr_gpB, 0);

  // XSAVE.
  a.nop();

  a.xgetbv();
  a.xsetbv();

  a.xsave(anyptr_gpA);
  a.xsaveopt(anyptr_gpA);
  a.xrstor(anyptr_gpA);

  // AVX.
  a.nop();

  a.vaddpd(xmmA, xmmB, xmmC);
  a.vaddpd(xmmA, xmmB, anyptr_gpC);
  a.vaddpd(ymmA, ymmB, ymmC);
  a.vaddpd(ymmA, ymmB, anyptr_gpC);
  a.vaddps(xmmA, xmmB, xmmC);
  a.vaddps(xmmA, xmmB, anyptr_gpC);
  a.vaddps(ymmA, ymmB, ymmC);
  a.vaddps(ymmA, ymmB, anyptr_gpC);
  a.vaddsd(xmmA, xmmB, xmmC);
  a.vaddsd(xmmA, xmmB, anyptr_gpC);
  a.vaddss(xmmA, xmmB, xmmC);
  a.vaddss(xmmA, xmmB, anyptr_gpC);
  a.vaddsubpd(xmmA, xmmB, xmmC);
  a.vaddsubpd(xmmA, xmmB, anyptr_gpC);
  a.vaddsubpd(ymmA, ymmB, ymmC);
  a.vaddsubpd(ymmA, ymmB, anyptr_gpC);
  a.vaddsubps(xmmA, xmmB, xmmC);
  a.vaddsubps(xmmA, xmmB, anyptr_gpC);
  a.vaddsubps(ymmA, ymmB, ymmC);
  a.vaddsubps(ymmA, ymmB, anyptr_gpC);
  a.vandpd(xmmA, xmmB, xmmC);
  a.vandpd(xmmA, xmmB, anyptr_gpC);
  a.vandpd(ymmA, ymmB, ymmC);
  a.vandpd(ymmA, ymmB, anyptr_gpC);
  a.vandps(xmmA, xmmB, xmmC);
  a.vandps(xmmA, xmmB, anyptr_gpC);
  a.vandps(ymmA, ymmB, ymmC);
  a.vandps(ymmA, ymmB, anyptr_gpC);
  a.vandnpd(xmmA, xmmB, xmmC);
  a.vandnpd(xmmA, xmmB, anyptr_gpC);
  a.vandnpd(ymmA, ymmB, ymmC);
  a.vandnpd(ymmA, ymmB, anyptr_gpC);
  a.vandnps(xmmA, xmmB, xmmC);
  a.vandnps(xmmA, xmmB, anyptr_gpC);
  a.vandnps(ymmA, ymmB, ymmC);
  a.vandnps(ymmA, ymmB, anyptr_gpC);
  a.vblendpd(xmmA, xmmB, xmmC, 0);
  a.vblendpd(xmmA, xmmB, anyptr_gpC, 0);
  a.vblendpd(ymmA, ymmB, ymmC, 0);
  a.vblendpd(ymmA, ymmB, anyptr_gpC, 0);
  a.vblendps(xmmA, xmmB, xmmC, 0);
  a.vblendps(xmmA, xmmB, anyptr_gpC, 0);
  a.vblendps(ymmA, ymmB, ymmC, 0);
  a.vblendps(ymmA, ymmB, anyptr_gpC, 0);
  a.vblendvpd(xmmA, xmmB, xmmC, xmmD);
  a.vblendvpd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vblendvpd(ymmA, ymmB, ymmC, ymmD);
  a.vblendvpd(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vbroadcastf128(ymmA, anyptr_gpB);
  a.vbroadcastsd(ymmA, anyptr_gpB);
  a.vbroadcastss(xmmA, anyptr_gpB);
  a.vbroadcastss(ymmA, anyptr_gpB);
  a.vcmppd(xmmA, xmmB, xmmC, 0);
  a.vcmppd(xmmA, xmmB, anyptr_gpC, 0);
  a.vcmppd(ymmA, ymmB, ymmC, 0);
  a.vcmppd(ymmA, ymmB, anyptr_gpC, 0);
  a.vcmpps(xmmA, xmmB, xmmC, 0);
  a.vcmpps(xmmA, xmmB, anyptr_gpC, 0);
  a.vcmpps(ymmA, ymmB, ymmC, 0);
  a.vcmpps(ymmA, ymmB, anyptr_gpC, 0);
  a.vcmpsd(xmmA, xmmB, xmmC, 0);
  a.vcmpsd(xmmA, xmmB, anyptr_gpC, 0);
  a.vcmpss(xmmA, xmmB, xmmC, 0);
  a.vcmpss(xmmA, xmmB, anyptr_gpC, 0);
  a.vcomisd(xmmA, xmmB);
  a.vcomisd(xmmA, anyptr_gpB);
  a.vcomiss(xmmA, xmmB);
  a.vcomiss(xmmA, anyptr_gpB);
  a.vcvtdq2pd(xmmA, xmmB);
  a.vcvtdq2pd(xmmA, anyptr_gpB);
  a.vcvtdq2pd(ymmA, xmmB);
  a.vcvtdq2pd(ymmA, anyptr_gpB);
  a.vcvtdq2ps(xmmA, xmmB);
  a.vcvtdq2ps(xmmA, anyptr_gpB);
  a.vcvtdq2ps(ymmA, ymmB);
  a.vcvtdq2ps(ymmA, anyptr_gpB);
  a.vcvtpd2dq(xmmA, xmmB);
  a.vcvtpd2dq(xmmA, ymmB);
  a.vcvtpd2dq(xmmA, anyptr_gpB);
  a.vcvtpd2ps(xmmA, xmmB);
  a.vcvtpd2ps(xmmA, ymmB);
  a.vcvtpd2ps(xmmA, anyptr_gpB);
  a.vcvtps2dq(xmmA, xmmB);
  a.vcvtps2dq(xmmA, anyptr_gpB);
  a.vcvtps2dq(ymmA, ymmB);
  a.vcvtps2dq(ymmA, anyptr_gpB);
  a.vcvtps2pd(xmmA, xmmB);
  a.vcvtps2pd(xmmA, anyptr_gpB);
  a.vcvtps2pd(ymmA, xmmB);
  a.vcvtps2pd(ymmA, anyptr_gpB);
  a.vcvtsd2si(gzA, xmmB);
  a.vcvtsd2si(gzA, anyptr_gpB);
  a.vcvtsd2ss(xmmA, xmmB, xmmC);
  a.vcvtsd2ss(xmmA, xmmB, anyptr_gpC);
  a.vcvtsi2sd(xmmA, xmmB, gzC);
  a.vcvtsi2sd(xmmA, xmmB, anyptr_gpC);
  a.vcvtsi2ss(xmmA, xmmB, gzC);
  a.vcvtsi2ss(xmmA, xmmB, anyptr_gpC);
  a.vcvtss2sd(xmmA, xmmB, xmmC);
  a.vcvtss2sd(xmmA, xmmB, anyptr_gpC);
  a.vcvtss2si(gzA, xmmB);
  a.vcvtss2si(gzA, anyptr_gpB);
  a.vcvttpd2dq(xmmA, xmmB);
  a.vcvttpd2dq(xmmA, ymmB);
  a.vcvttpd2dq(xmmA, anyptr_gpB);
  a.vcvttps2dq(xmmA, xmmB);
  a.vcvttps2dq(xmmA, anyptr_gpB);
  a.vcvttps2dq(ymmA, ymmB);
  a.vcvttps2dq(ymmA, anyptr_gpB);
  a.vcvttsd2si(gzA, xmmB);
  a.vcvttsd2si(gzA, anyptr_gpB);
  a.vcvttss2si(gzA, xmmB);
  a.vcvttss2si(gzA, anyptr_gpB);
  a.vdivpd(xmmA, xmmB, xmmC);
  a.vdivpd(xmmA, xmmB, anyptr_gpC);
  a.vdivpd(ymmA, ymmB, ymmC);
  a.vdivpd(ymmA, ymmB, anyptr_gpC);
  a.vdivps(xmmA, xmmB, xmmC);
  a.vdivps(xmmA, xmmB, anyptr_gpC);
  a.vdivps(ymmA, ymmB, ymmC);
  a.vdivps(ymmA, ymmB, anyptr_gpC);
  a.vdivsd(xmmA, xmmB, xmmC);
  a.vdivsd(xmmA, xmmB, anyptr_gpC);
  a.vdivss(xmmA, xmmB, xmmC);
  a.vdivss(xmmA, xmmB, anyptr_gpC);
  a.vdppd(xmmA, xmmB, xmmC, 0);
  a.vdppd(xmmA, xmmB, anyptr_gpC, 0);
  a.vdpps(xmmA, xmmB, xmmC, 0);
  a.vdpps(xmmA, xmmB, anyptr_gpC, 0);
  a.vdpps(ymmA, ymmB, ymmC, 0);
  a.vdpps(ymmA, ymmB, anyptr_gpC, 0);
  a.vextractf128(xmmA, ymmB, 0);
  a.vextractf128(anyptr_gpA, ymmB, 0);
  a.vextractps(gzA, xmmB, 0);
  a.vextractps(anyptr_gpA, xmmB, 0);
  a.vhaddpd(xmmA, xmmB, xmmC);
  a.vhaddpd(xmmA, xmmB, anyptr_gpC);
  a.vhaddpd(ymmA, ymmB, ymmC);
  a.vhaddpd(ymmA, ymmB, anyptr_gpC);
  a.vhaddps(xmmA, xmmB, xmmC);
  a.vhaddps(xmmA, xmmB, anyptr_gpC);
  a.vhaddps(ymmA, ymmB, ymmC);
  a.vhaddps(ymmA, ymmB, anyptr_gpC);
  a.vhsubpd(xmmA, xmmB, xmmC);
  a.vhsubpd(xmmA, xmmB, anyptr_gpC);
  a.vhsubpd(ymmA, ymmB, ymmC);
  a.vhsubpd(ymmA, ymmB, anyptr_gpC);
  a.vhsubps(xmmA, xmmB, xmmC);
  a.vhsubps(xmmA, xmmB, anyptr_gpC);
  a.vhsubps(ymmA, ymmB, ymmC);
  a.vhsubps(ymmA, ymmB, anyptr_gpC);
  a.vinsertf128(ymmA, ymmB, xmmC, 0);
  a.vinsertf128(ymmA, ymmB, anyptr_gpC, 0);
  a.vinsertps(xmmA, xmmB, xmmC, 0);
  a.vinsertps(xmmA, xmmB, anyptr_gpC, 0);
  a.vlddqu(xmmA, anyptr_gpB);
  a.vlddqu(ymmA, anyptr_gpB);
  a.vldmxcsr(anyptr_gpA);
  a.vmaskmovdqu(xmmA, xmmB);
  a.vmaskmovps(xmmA, xmmB, anyptr_gpC);
  a.vmaskmovps(ymmA, ymmB, anyptr_gpC);
  a.vmaskmovps(anyptr_gpA, xmmB, xmmC);
  a.vmaskmovps(anyptr_gpA, ymmB, ymmC);
  a.vmaskmovpd(xmmA, xmmB, anyptr_gpC);
  a.vmaskmovpd(ymmA, ymmB, anyptr_gpC);
  a.vmaskmovpd(anyptr_gpA, xmmB, xmmC);
  a.vmaskmovpd(anyptr_gpA, ymmB, ymmC);
  a.vmaxpd(xmmA, xmmB, xmmC);
  a.vmaxpd(xmmA, xmmB, anyptr_gpC);
  a.vmaxpd(ymmA, ymmB, ymmC);
  a.vmaxpd(ymmA, ymmB, anyptr_gpC);
  a.vmaxps(xmmA, xmmB, xmmC);
  a.vmaxps(xmmA, xmmB, anyptr_gpC);
  a.vmaxps(ymmA, ymmB, ymmC);
  a.vmaxps(ymmA, ymmB, anyptr_gpC);
  a.vmaxsd(xmmA, xmmB, xmmC);
  a.vmaxsd(xmmA, xmmB, anyptr_gpC);
  a.vmaxss(xmmA, xmmB, xmmC);
  a.vmaxss(xmmA, xmmB, anyptr_gpC);
  a.vminpd(xmmA, xmmB, xmmC);
  a.vminpd(xmmA, xmmB, anyptr_gpC);
  a.vminpd(ymmA, ymmB, ymmC);
  a.vminpd(ymmA, ymmB, anyptr_gpC);
  a.vminps(xmmA, xmmB, xmmC);
  a.vminps(xmmA, xmmB, anyptr_gpC);
  a.vminps(ymmA, ymmB, ymmC);
  a.vminps(ymmA, ymmB, anyptr_gpC);
  a.vminsd(xmmA, xmmB, xmmC);
  a.vminsd(xmmA, xmmB, anyptr_gpC);
  a.vminss(xmmA, xmmB, xmmC);
  a.vminss(xmmA, xmmB, anyptr_gpC);
  a.vmovapd(xmmA, xmmB);
  a.vmovapd(xmmA, anyptr_gpB);
  a.vmovapd(anyptr_gpA, xmmB);
  a.vmovapd(ymmA, ymmB);
  a.vmovapd(ymmA, anyptr_gpB);
  a.vmovapd(anyptr_gpA, ymmB);
  a.vmovaps(xmmA, xmmB);
  a.vmovaps(xmmA, anyptr_gpB);
  a.vmovaps(anyptr_gpA, xmmB);
  a.vmovaps(ymmA, ymmB);
  a.vmovaps(ymmA, anyptr_gpB);
  a.vmovaps(anyptr_gpA, ymmB);
  a.vmovd(xmmA, gzB);
  a.vmovd(xmmA, anyptr_gpB);
  a.vmovd(gzA, xmmB);
  a.vmovd(anyptr_gpA, xmmB);
  a.vmovddup(xmmA, xmmB);
  a.vmovddup(xmmA, anyptr_gpB);
  a.vmovddup(ymmA, ymmB);
  a.vmovddup(ymmA, anyptr_gpB);
  a.vmovdqa(xmmA, xmmB);
  a.vmovdqa(xmmA, anyptr_gpB);
  a.vmovdqa(anyptr_gpA, xmmB);
  a.vmovdqa(ymmA, ymmB);
  a.vmovdqa(ymmA, anyptr_gpB);
  a.vmovdqa(anyptr_gpA, ymmB);
  a.vmovdqu(xmmA, xmmB);
  a.vmovdqu(xmmA, anyptr_gpB);
  a.vmovdqu(anyptr_gpA, xmmB);
  a.vmovdqu(ymmA, ymmB);
  a.vmovdqu(ymmA, anyptr_gpB);
  a.vmovdqu(anyptr_gpA, ymmB);
  a.vmovhlps(xmmA, xmmB, xmmC);
  a.vmovhpd(xmmA, xmmB, anyptr_gpC);
  a.vmovhpd(anyptr_gpA, xmmB);
  a.vmovhps(xmmA, xmmB, anyptr_gpC);
  a.vmovhps(anyptr_gpA, xmmB);
  a.vmovlhps(xmmA, xmmB, xmmC);
  a.vmovlpd(xmmA, xmmB, anyptr_gpC);
  a.vmovlpd(anyptr_gpA, xmmB);
  a.vmovlps(xmmA, xmmB, anyptr_gpC);
  a.vmovlps(anyptr_gpA, xmmB);
  a.vmovmskpd(gzA, xmmB);
  a.vmovmskpd(gzA, ymmB);
  a.vmovmskps(gzA, xmmB);
  a.vmovmskps(gzA, ymmB);
  a.vmovntdq(anyptr_gpA, xmmB);
  a.vmovntdq(anyptr_gpA, ymmB);
  a.vmovntdqa(xmmA, anyptr_gpB);
  a.vmovntpd(anyptr_gpA, xmmB);
  a.vmovntpd(anyptr_gpA, ymmB);
  a.vmovntps(anyptr_gpA, xmmB);
  a.vmovntps(anyptr_gpA, ymmB);
  a.vmovsd(xmmA, xmmB, xmmC);
  a.vmovsd(xmmA, anyptr_gpB);
  a.vmovsd(anyptr_gpA, xmmB);
  a.vmovshdup(xmmA, xmmB);
  a.vmovshdup(xmmA, anyptr_gpB);
  a.vmovshdup(ymmA, ymmB);
  a.vmovshdup(ymmA, anyptr_gpB);
  a.vmovsldup(xmmA, xmmB);
  a.vmovsldup(xmmA, anyptr_gpB);
  a.vmovsldup(ymmA, ymmB);
  a.vmovsldup(ymmA, anyptr_gpB);
  a.vmovss(xmmA, xmmB, xmmC);
  a.vmovss(xmmA, anyptr_gpB);
  a.vmovss(anyptr_gpA, xmmB);
  a.vmovupd(xmmA, xmmB);
  a.vmovupd(xmmA, anyptr_gpB);
  a.vmovupd(anyptr_gpA, xmmB);
  a.vmovupd(ymmA, ymmB);
  a.vmovupd(ymmA, anyptr_gpB);
  a.vmovupd(anyptr_gpA, ymmB);
  a.vmovups(xmmA, xmmB);
  a.vmovups(xmmA, anyptr_gpB);
  a.vmovups(anyptr_gpA, xmmB);
  a.vmovups(ymmA, ymmB);
  a.vmovups(ymmA, anyptr_gpB);
  a.vmovups(anyptr_gpA, ymmB);
  a.vmpsadbw(xmmA, xmmB, xmmC, 0);
  a.vmpsadbw(xmmA, xmmB, anyptr_gpC, 0);
  a.vmulpd(xmmA, xmmB, xmmC);
  a.vmulpd(xmmA, xmmB, anyptr_gpC);
  a.vmulpd(ymmA, ymmB, ymmC);
  a.vmulpd(ymmA, ymmB, anyptr_gpC);
  a.vmulps(xmmA, xmmB, xmmC);
  a.vmulps(xmmA, xmmB, anyptr_gpC);
  a.vmulps(ymmA, ymmB, ymmC);
  a.vmulps(ymmA, ymmB, anyptr_gpC);
  a.vmulsd(xmmA, xmmB, xmmC);
  a.vmulsd(xmmA, xmmB, anyptr_gpC);
  a.vmulss(xmmA, xmmB, xmmC);
  a.vmulss(xmmA, xmmB, anyptr_gpC);
  a.vorpd(xmmA, xmmB, xmmC);
  a.vorpd(xmmA, xmmB, anyptr_gpC);
  a.vorpd(ymmA, ymmB, ymmC);
  a.vorpd(ymmA, ymmB, anyptr_gpC);
  a.vorps(xmmA, xmmB, xmmC);
  a.vorps(xmmA, xmmB, anyptr_gpC);
  a.vorps(ymmA, ymmB, ymmC);
  a.vorps(ymmA, ymmB, anyptr_gpC);
  a.vpabsb(xmmA, xmmB);
  a.vpabsb(xmmA, anyptr_gpB);
  a.vpabsd(xmmA, xmmB);
  a.vpabsd(xmmA, anyptr_gpB);
  a.vpabsw(xmmA, xmmB);
  a.vpabsw(xmmA, anyptr_gpB);
  a.vpackssdw(xmmA, xmmB, xmmC);
  a.vpackssdw(xmmA, xmmB, anyptr_gpC);
  a.vpacksswb(xmmA, xmmB, xmmC);
  a.vpacksswb(xmmA, xmmB, anyptr_gpC);
  a.vpackusdw(xmmA, xmmB, xmmC);
  a.vpackusdw(xmmA, xmmB, anyptr_gpC);
  a.vpackuswb(xmmA, xmmB, xmmC);
  a.vpackuswb(xmmA, xmmB, anyptr_gpC);
  a.vpaddb(xmmA, xmmB, xmmC);
  a.vpaddb(xmmA, xmmB, anyptr_gpC);
  a.vpaddd(xmmA, xmmB, xmmC);
  a.vpaddd(xmmA, xmmB, anyptr_gpC);
  a.vpaddq(xmmA, xmmB, xmmC);
  a.vpaddq(xmmA, xmmB, anyptr_gpC);
  a.vpaddw(xmmA, xmmB, xmmC);
  a.vpaddw(xmmA, xmmB, anyptr_gpC);
  a.vpaddsb(xmmA, xmmB, xmmC);
  a.vpaddsb(xmmA, xmmB, anyptr_gpC);
  a.vpaddsw(xmmA, xmmB, xmmC);
  a.vpaddsw(xmmA, xmmB, anyptr_gpC);
  a.vpaddusb(xmmA, xmmB, xmmC);
  a.vpaddusb(xmmA, xmmB, anyptr_gpC);
  a.vpaddusw(xmmA, xmmB, xmmC);
  a.vpaddusw(xmmA, xmmB, anyptr_gpC);
  a.vpalignr(xmmA, xmmB, xmmC, 0);
  a.vpalignr(xmmA, xmmB, anyptr_gpC, 0);
  a.vpand(xmmA, xmmB, xmmC);
  a.vpand(xmmA, xmmB, anyptr_gpC);
  a.vpandn(xmmA, xmmB, xmmC);
  a.vpandn(xmmA, xmmB, anyptr_gpC);
  a.vpavgb(xmmA, xmmB, xmmC);
  a.vpavgb(xmmA, xmmB, anyptr_gpC);
  a.vpavgw(xmmA, xmmB, xmmC);
  a.vpavgw(xmmA, xmmB, anyptr_gpC);
  a.vpblendvb(xmmA, xmmB, xmmC, xmmD);
  a.vpblendvb(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpblendw(xmmA, xmmB, xmmC, 0);
  a.vpblendw(xmmA, xmmB, anyptr_gpC, 0);
  a.vpcmpeqb(xmmA, xmmB, xmmC);
  a.vpcmpeqb(xmmA, xmmB, anyptr_gpC);
  a.vpcmpeqd(xmmA, xmmB, xmmC);
  a.vpcmpeqd(xmmA, xmmB, anyptr_gpC);
  a.vpcmpeqq(xmmA, xmmB, xmmC);
  a.vpcmpeqq(xmmA, xmmB, anyptr_gpC);
  a.vpcmpeqw(xmmA, xmmB, xmmC);
  a.vpcmpeqw(xmmA, xmmB, anyptr_gpC);
  a.vpcmpgtb(xmmA, xmmB, xmmC);
  a.vpcmpgtb(xmmA, xmmB, anyptr_gpC);
  a.vpcmpgtd(xmmA, xmmB, xmmC);
  a.vpcmpgtd(xmmA, xmmB, anyptr_gpC);
  a.vpcmpgtq(xmmA, xmmB, xmmC);
  a.vpcmpgtq(xmmA, xmmB, anyptr_gpC);
  a.vpcmpgtw(xmmA, xmmB, xmmC);
  a.vpcmpgtw(xmmA, xmmB, anyptr_gpC);
  a.vpcmpestri(xmmA, xmmB, 0);
  a.vpcmpestri(xmmA, anyptr_gpB, 0);
  a.vpcmpestrm(xmmA, xmmB, 0);
  a.vpcmpestrm(xmmA, anyptr_gpB, 0);
  a.vpcmpistri(xmmA, xmmB, 0);
  a.vpcmpistri(xmmA, anyptr_gpB, 0);
  a.vpcmpistrm(xmmA, xmmB, 0);
  a.vpcmpistrm(xmmA, anyptr_gpB, 0);
  a.vpermilpd(xmmA, xmmB, xmmC);
  a.vpermilpd(xmmA, xmmB, anyptr_gpC);
  a.vpermilpd(ymmA, ymmB, ymmC);
  a.vpermilpd(ymmA, ymmB, anyptr_gpC);
  a.vpermilpd(xmmA, xmmB, 0);
  a.vpermilpd(xmmA, anyptr_gpB, 0);
  a.vpermilpd(ymmA, ymmB, 0);
  a.vpermilpd(ymmA, anyptr_gpB, 0);
  a.vpermilps(xmmA, xmmB, xmmC);
  a.vpermilps(xmmA, xmmB, anyptr_gpC);
  a.vpermilps(ymmA, ymmB, ymmC);
  a.vpermilps(ymmA, ymmB, anyptr_gpC);
  a.vpermilps(xmmA, xmmB, 0);
  a.vpermilps(xmmA, anyptr_gpB, 0);
  a.vpermilps(ymmA, ymmB, 0);
  a.vpermilps(ymmA, anyptr_gpB, 0);
  a.vperm2f128(ymmA, ymmB, ymmC, 0);
  a.vperm2f128(ymmA, ymmB, anyptr_gpC, 0);
  a.vpextrb(gzA, xmmB, 0);
  a.vpextrb(anyptr_gpA, xmmB, 0);
  a.vpextrd(gzA, xmmB, 0);
  a.vpextrd(anyptr_gpA, xmmB, 0);
  a.vpextrw(gzA, xmmB, 0);
  a.vpextrw(anyptr_gpA, xmmB, 0);
  a.vphaddd(xmmA, xmmB, xmmC);
  a.vphaddd(xmmA, xmmB, anyptr_gpC);
  a.vphaddsw(xmmA, xmmB, xmmC);
  a.vphaddsw(xmmA, xmmB, anyptr_gpC);
  a.vphaddw(xmmA, xmmB, xmmC);
  a.vphaddw(xmmA, xmmB, anyptr_gpC);
  a.vphminposuw(xmmA, xmmB);
  a.vphminposuw(xmmA, anyptr_gpB);
  a.vphsubd(xmmA, xmmB, xmmC);
  a.vphsubd(xmmA, xmmB, anyptr_gpC);
  a.vphsubsw(xmmA, xmmB, xmmC);
  a.vphsubsw(xmmA, xmmB, anyptr_gpC);
  a.vphsubw(xmmA, xmmB, xmmC);
  a.vphsubw(xmmA, xmmB, anyptr_gpC);
  a.vpinsrb(xmmA, xmmB, gzC, 0);
  a.vpinsrb(xmmA, xmmB, anyptr_gpC, 0);
  a.vpinsrd(xmmA, xmmB, gzC, 0);
  a.vpinsrd(xmmA, xmmB, anyptr_gpC, 0);
  a.vpinsrw(xmmA, xmmB, gzC, 0);
  a.vpinsrw(xmmA, xmmB, anyptr_gpC, 0);
  a.vpmaddubsw(xmmA, xmmB, xmmC);
  a.vpmaddubsw(xmmA, xmmB, anyptr_gpC);
  a.vpmaddwd(xmmA, xmmB, xmmC);
  a.vpmaddwd(xmmA, xmmB, anyptr_gpC);
  a.vpmaxsb(xmmA, xmmB, xmmC);
  a.vpmaxsb(xmmA, xmmB, anyptr_gpC);
  a.vpmaxsd(xmmA, xmmB, xmmC);
  a.vpmaxsd(xmmA, xmmB, anyptr_gpC);
  a.vpmaxsw(xmmA, xmmB, xmmC);
  a.vpmaxsw(xmmA, xmmB, anyptr_gpC);
  a.vpmaxub(xmmA, xmmB, xmmC);
  a.vpmaxub(xmmA, xmmB, anyptr_gpC);
  a.vpmaxud(xmmA, xmmB, xmmC);
  a.vpmaxud(xmmA, xmmB, anyptr_gpC);
  a.vpmaxuw(xmmA, xmmB, xmmC);
  a.vpmaxuw(xmmA, xmmB, anyptr_gpC);
  a.vpminsb(xmmA, xmmB, xmmC);
  a.vpminsb(xmmA, xmmB, anyptr_gpC);
  a.vpminsd(xmmA, xmmB, xmmC);
  a.vpminsd(xmmA, xmmB, anyptr_gpC);
  a.vpminsw(xmmA, xmmB, xmmC);
  a.vpminsw(xmmA, xmmB, anyptr_gpC);
  a.vpminub(xmmA, xmmB, xmmC);
  a.vpminub(xmmA, xmmB, anyptr_gpC);
  a.vpminud(xmmA, xmmB, xmmC);
  a.vpminud(xmmA, xmmB, anyptr_gpC);
  a.vpminuw(xmmA, xmmB, xmmC);
  a.vpminuw(xmmA, xmmB, anyptr_gpC);
  a.vpmovmskb(gzA, xmmB);
  a.vpmovsxbd(xmmA, xmmB);
  a.vpmovsxbd(xmmA, anyptr_gpB);
  a.vpmovsxbq(xmmA, xmmB);
  a.vpmovsxbq(xmmA, anyptr_gpB);
  a.vpmovsxbw(xmmA, xmmB);
  a.vpmovsxbw(xmmA, anyptr_gpB);
  a.vpmovsxdq(xmmA, xmmB);
  a.vpmovsxdq(xmmA, anyptr_gpB);
  a.vpmovsxwd(xmmA, xmmB);
  a.vpmovsxwd(xmmA, anyptr_gpB);
  a.vpmovsxwq(xmmA, xmmB);
  a.vpmovsxwq(xmmA, anyptr_gpB);
  a.vpmovzxbd(xmmA, xmmB);
  a.vpmovzxbd(xmmA, anyptr_gpB);
  a.vpmovzxbq(xmmA, xmmB);
  a.vpmovzxbq(xmmA, anyptr_gpB);
  a.vpmovzxbw(xmmA, xmmB);
  a.vpmovzxbw(xmmA, anyptr_gpB);
  a.vpmovzxdq(xmmA, xmmB);
  a.vpmovzxdq(xmmA, anyptr_gpB);
  a.vpmovzxwd(xmmA, xmmB);
  a.vpmovzxwd(xmmA, anyptr_gpB);
  a.vpmovzxwq(xmmA, xmmB);
  a.vpmovzxwq(xmmA, anyptr_gpB);
  a.vpmuldq(xmmA, xmmB, xmmC);
  a.vpmuldq(xmmA, xmmB, anyptr_gpC);
  a.vpmulhrsw(xmmA, xmmB, xmmC);
  a.vpmulhrsw(xmmA, xmmB, anyptr_gpC);
  a.vpmulhuw(xmmA, xmmB, xmmC);
  a.vpmulhuw(xmmA, xmmB, anyptr_gpC);
  a.vpmulhw(xmmA, xmmB, xmmC);
  a.vpmulhw(xmmA, xmmB, anyptr_gpC);
  a.vpmulld(xmmA, xmmB, xmmC);
  a.vpmulld(xmmA, xmmB, anyptr_gpC);
  a.vpmullw(xmmA, xmmB, xmmC);
  a.vpmullw(xmmA, xmmB, anyptr_gpC);
  a.vpmuludq(xmmA, xmmB, xmmC);
  a.vpmuludq(xmmA, xmmB, anyptr_gpC);
  a.vpor(xmmA, xmmB, xmmC);
  a.vpor(xmmA, xmmB, anyptr_gpC);
  a.vpsadbw(xmmA, xmmB, xmmC);
  a.vpsadbw(xmmA, xmmB, anyptr_gpC);
  a.vpshufb(xmmA, xmmB, xmmC);
  a.vpshufb(xmmA, xmmB, anyptr_gpC);
  a.vpshufd(xmmA, xmmB, 0);
  a.vpshufd(xmmA, anyptr_gpB, 0);
  a.vpshufhw(xmmA, xmmB, 0);
  a.vpshufhw(xmmA, anyptr_gpB, 0);
  a.vpshuflw(xmmA, xmmB, 0);
  a.vpshuflw(xmmA, anyptr_gpB, 0);
  a.vpsignb(xmmA, xmmB, xmmC);
  a.vpsignb(xmmA, xmmB, anyptr_gpC);
  a.vpsignd(xmmA, xmmB, xmmC);
  a.vpsignd(xmmA, xmmB, anyptr_gpC);
  a.vpsignw(xmmA, xmmB, xmmC);
  a.vpsignw(xmmA, xmmB, anyptr_gpC);
  a.vpslld(xmmA, xmmB, xmmC);
  a.vpslld(xmmA, xmmB, anyptr_gpC);
  a.vpslld(xmmA, xmmB, 0);
  a.vpslldq(xmmA, xmmB, 0);
  a.vpsllq(xmmA, xmmB, xmmC);
  a.vpsllq(xmmA, xmmB, anyptr_gpC);
  a.vpsllq(xmmA, xmmB, 0);
  a.vpsllw(xmmA, xmmB, xmmC);
  a.vpsllw(xmmA, xmmB, anyptr_gpC);
  a.vpsllw(xmmA, xmmB, 0);
  a.vpsrad(xmmA, xmmB, xmmC);
  a.vpsrad(xmmA, xmmB, anyptr_gpC);
  a.vpsrad(xmmA, xmmB, 0);
  a.vpsraw(xmmA, xmmB, xmmC);
  a.vpsraw(xmmA, xmmB, anyptr_gpC);
  a.vpsraw(xmmA, xmmB, 0);
  a.vpsrld(xmmA, xmmB, xmmC);
  a.vpsrld(xmmA, xmmB, anyptr_gpC);
  a.vpsrld(xmmA, xmmB, 0);
  a.vpsrldq(xmmA, xmmB, 0);
  a.vpsrlq(xmmA, xmmB, xmmC);
  a.vpsrlq(xmmA, xmmB, anyptr_gpC);
  a.vpsrlq(xmmA, xmmB, 0);
  a.vpsrlw(xmmA, xmmB, xmmC);
  a.vpsrlw(xmmA, xmmB, anyptr_gpC);
  a.vpsrlw(xmmA, xmmB, 0);
  a.vpsubb(xmmA, xmmB, xmmC);
  a.vpsubb(xmmA, xmmB, anyptr_gpC);
  a.vpsubd(xmmA, xmmB, xmmC);
  a.vpsubd(xmmA, xmmB, anyptr_gpC);
  a.vpsubq(xmmA, xmmB, xmmC);
  a.vpsubq(xmmA, xmmB, anyptr_gpC);
  a.vpsubw(xmmA, xmmB, xmmC);
  a.vpsubw(xmmA, xmmB, anyptr_gpC);
  a.vpsubsb(xmmA, xmmB, xmmC);
  a.vpsubsb(xmmA, xmmB, anyptr_gpC);
  a.vpsubsw(xmmA, xmmB, xmmC);
  a.vpsubsw(xmmA, xmmB, anyptr_gpC);
  a.vpsubusb(xmmA, xmmB, xmmC);
  a.vpsubusb(xmmA, xmmB, anyptr_gpC);
  a.vpsubusw(xmmA, xmmB, xmmC);
  a.vpsubusw(xmmA, xmmB, anyptr_gpC);
  a.vptest(xmmA, xmmB);
  a.vptest(xmmA, anyptr_gpB);
  a.vptest(ymmA, ymmB);
  a.vptest(ymmA, anyptr_gpB);
  a.vpunpckhbw(xmmA, xmmB, xmmC);
  a.vpunpckhbw(xmmA, xmmB, anyptr_gpC);
  a.vpunpckhdq(xmmA, xmmB, xmmC);
  a.vpunpckhdq(xmmA, xmmB, anyptr_gpC);
  a.vpunpckhqdq(xmmA, xmmB, xmmC);
  a.vpunpckhqdq(xmmA, xmmB, anyptr_gpC);
  a.vpunpckhwd(xmmA, xmmB, xmmC);
  a.vpunpckhwd(xmmA, xmmB, anyptr_gpC);
  a.vpunpcklbw(xmmA, xmmB, xmmC);
  a.vpunpcklbw(xmmA, xmmB, anyptr_gpC);
  a.vpunpckldq(xmmA, xmmB, xmmC);
  a.vpunpckldq(xmmA, xmmB, anyptr_gpC);
  a.vpunpcklqdq(xmmA, xmmB, xmmC);
  a.vpunpcklqdq(xmmA, xmmB, anyptr_gpC);
  a.vpunpcklwd(xmmA, xmmB, xmmC);
  a.vpunpcklwd(xmmA, xmmB, anyptr_gpC);
  a.vpxor(xmmA, xmmB, xmmC);
  a.vpxor(xmmA, xmmB, anyptr_gpC);
  a.vrcpps(xmmA, xmmB);
  a.vrcpps(xmmA, anyptr_gpB);
  a.vrcpps(ymmA, ymmB);
  a.vrcpps(ymmA, anyptr_gpB);
  a.vrcpss(xmmA, xmmB, xmmC);
  a.vrcpss(xmmA, xmmB, anyptr_gpC);
  a.vrsqrtps(xmmA, xmmB);
  a.vrsqrtps(xmmA, anyptr_gpB);
  a.vrsqrtps(ymmA, ymmB);
  a.vrsqrtps(ymmA, anyptr_gpB);
  a.vrsqrtss(xmmA, xmmB, xmmC);
  a.vrsqrtss(xmmA, xmmB, anyptr_gpC);
  a.vroundpd(xmmA, xmmB, 0);
  a.vroundpd(xmmA, anyptr_gpB, 0);
  a.vroundpd(ymmA, ymmB, 0);
  a.vroundpd(ymmA, anyptr_gpB, 0);
  a.vroundps(xmmA, xmmB, 0);
  a.vroundps(xmmA, anyptr_gpB, 0);
  a.vroundps(ymmA, ymmB, 0);
  a.vroundps(ymmA, anyptr_gpB, 0);
  a.vroundsd(xmmA, xmmB, xmmC, 0);
  a.vroundsd(xmmA, xmmB, anyptr_gpC, 0);
  a.vroundss(xmmA, xmmB, xmmC, 0);
  a.vroundss(xmmA, xmmB, anyptr_gpC, 0);
  a.vshufpd(xmmA, xmmB, xmmC, 0);
  a.vshufpd(xmmA, xmmB, anyptr_gpC, 0);
  a.vshufpd(ymmA, ymmB, ymmC, 0);
  a.vshufpd(ymmA, ymmB, anyptr_gpC, 0);
  a.vshufps(xmmA, xmmB, xmmC, 0);
  a.vshufps(xmmA, xmmB, anyptr_gpC, 0);
  a.vshufps(ymmA, ymmB, ymmC, 0);
  a.vshufps(ymmA, ymmB, anyptr_gpC, 0);
  a.vsqrtpd(xmmA, xmmB);
  a.vsqrtpd(xmmA, anyptr_gpB);
  a.vsqrtpd(ymmA, ymmB);
  a.vsqrtpd(ymmA, anyptr_gpB);
  a.vsqrtps(xmmA, xmmB);
  a.vsqrtps(xmmA, anyptr_gpB);
  a.vsqrtps(ymmA, ymmB);
  a.vsqrtps(ymmA, anyptr_gpB);
  a.vsqrtsd(xmmA, xmmB, xmmC);
  a.vsqrtsd(xmmA, xmmB, anyptr_gpC);
  a.vsqrtss(xmmA, xmmB, xmmC);
  a.vsqrtss(xmmA, xmmB, anyptr_gpC);
  a.vstmxcsr(anyptr_gpA);
  a.vsubpd(xmmA, xmmB, xmmC);
  a.vsubpd(xmmA, xmmB, anyptr_gpC);
  a.vsubpd(ymmA, ymmB, ymmC);
  a.vsubpd(ymmA, ymmB, anyptr_gpC);
  a.vsubps(xmmA, xmmB, xmmC);
  a.vsubps(xmmA, xmmB, anyptr_gpC);
  a.vsubps(ymmA, ymmB, ymmC);
  a.vsubps(ymmA, ymmB, anyptr_gpC);
  a.vsubsd(xmmA, xmmB, xmmC);
  a.vsubsd(xmmA, xmmB, anyptr_gpC);
  a.vsubss(xmmA, xmmB, xmmC);
  a.vsubss(xmmA, xmmB, anyptr_gpC);
  a.vtestps(xmmA, xmmB);
  a.vtestps(xmmA, anyptr_gpB);
  a.vtestps(ymmA, ymmB);
  a.vtestps(ymmA, anyptr_gpB);
  a.vtestpd(xmmA, xmmB);
  a.vtestpd(xmmA, anyptr_gpB);
  a.vtestpd(ymmA, ymmB);
  a.vtestpd(ymmA, anyptr_gpB);
  a.vucomisd(xmmA, xmmB);
  a.vucomisd(xmmA, anyptr_gpB);
  a.vucomiss(xmmA, xmmB);
  a.vucomiss(xmmA, anyptr_gpB);
  a.vunpckhpd(xmmA, xmmB, xmmC);
  a.vunpckhpd(xmmA, xmmB, anyptr_gpC);
  a.vunpckhpd(ymmA, ymmB, ymmC);
  a.vunpckhpd(ymmA, ymmB, anyptr_gpC);
  a.vunpckhps(xmmA, xmmB, xmmC);
  a.vunpckhps(xmmA, xmmB, anyptr_gpC);
  a.vunpckhps(ymmA, ymmB, ymmC);
  a.vunpckhps(ymmA, ymmB, anyptr_gpC);
  a.vunpcklpd(xmmA, xmmB, xmmC);
  a.vunpcklpd(xmmA, xmmB, anyptr_gpC);
  a.vunpcklpd(ymmA, ymmB, ymmC);
  a.vunpcklpd(ymmA, ymmB, anyptr_gpC);
  a.vunpcklps(xmmA, xmmB, xmmC);
  a.vunpcklps(xmmA, xmmB, anyptr_gpC);
  a.vunpcklps(ymmA, ymmB, ymmC);
  a.vunpcklps(ymmA, ymmB, anyptr_gpC);
  a.vxorpd(xmmA, xmmB, xmmC);
  a.vxorpd(xmmA, xmmB, anyptr_gpC);
  a.vxorpd(ymmA, ymmB, ymmC);
  a.vxorpd(ymmA, ymmB, anyptr_gpC);
  a.vxorps(xmmA, xmmB, xmmC);
  a.vxorps(xmmA, xmmB, anyptr_gpC);
  a.vxorps(ymmA, ymmB, ymmC);
  a.vxorps(ymmA, ymmB, anyptr_gpC);
  a.vzeroall();
  a.vzeroupper();

  // AVX+AESNI.
  a.nop();

  a.vaesdec(xmmA, xmmB, xmmC);
  a.vaesdec(xmmA, xmmB, anyptr_gpC);
  a.vaesdeclast(xmmA, xmmB, xmmC);
  a.vaesdeclast(xmmA, xmmB, anyptr_gpC);
  a.vaesenc(xmmA, xmmB, xmmC);
  a.vaesenc(xmmA, xmmB, anyptr_gpC);
  a.vaesenclast(xmmA, xmmB, xmmC);
  a.vaesenclast(xmmA, xmmB, anyptr_gpC);
  a.vaesimc(xmmA, xmmB);
  a.vaesimc(xmmA, anyptr_gpB);
  a.vaeskeygenassist(xmmA, xmmB, 0);
  a.vaeskeygenassist(xmmA, anyptr_gpB, 0);

  // AVX+PCLMULQDQ.
  a.nop();

  a.vpclmulqdq(xmmA, xmmB, xmmC, 0);
  a.vpclmulqdq(xmmA, xmmB, anyptr_gpC, 0);

  // AVX2.
  a.nop();

  a.vbroadcasti128(ymmA, anyptr_gpB);
  a.vbroadcastsd(ymmA, xmmB);
  a.vbroadcastss(xmmA, xmmB);
  a.vbroadcastss(ymmA, xmmB);
  a.vextracti128(xmmA, ymmB, 0);
  a.vextracti128(anyptr_gpA, ymmB, 0);
  a.vgatherdpd(xmmA, vmxptr_gpB, xmmC);
  a.vgatherdpd(ymmA, vmyptr_gpB, ymmC);
  a.vgatherdps(xmmA, vmxptr_gpB, xmmC);
  a.vgatherdps(ymmA, vmyptr_gpB, ymmC);
  a.vgatherqpd(xmmA, vmxptr_gpB, xmmC);
  a.vgatherqpd(ymmA, vmyptr_gpB, ymmC);
  a.vgatherqps(xmmA, vmxptr_gpB, xmmC);
  a.vgatherqps(xmmA, vmyptr_gpB, xmmC);
  a.vinserti128(ymmA, ymmB, xmmC, 0);
  a.vinserti128(ymmA, ymmB, anyptr_gpC, 0);
  a.vmovntdqa(ymmA, anyptr_gpB);
  a.vmpsadbw(ymmA, ymmB, ymmC, 0);
  a.vmpsadbw(ymmA, ymmB, anyptr_gpC, 0);
  a.vpabsb(ymmA, ymmB);
  a.vpabsb(ymmA, anyptr_gpB);
  a.vpabsd(ymmA, ymmB);
  a.vpabsd(ymmA, anyptr_gpB);
  a.vpabsw(ymmA, ymmB);
  a.vpabsw(ymmA, anyptr_gpB);
  a.vpackssdw(ymmA, ymmB, ymmC);
  a.vpackssdw(ymmA, ymmB, anyptr_gpC);
  a.vpacksswb(ymmA, ymmB, ymmC);
  a.vpacksswb(ymmA, ymmB, anyptr_gpC);
  a.vpackusdw(ymmA, ymmB, ymmC);
  a.vpackusdw(ymmA, ymmB, anyptr_gpC);
  a.vpackuswb(ymmA, ymmB, ymmC);
  a.vpackuswb(ymmA, ymmB, anyptr_gpC);
  a.vpaddb(ymmA, ymmB, ymmC);
  a.vpaddb(ymmA, ymmB, anyptr_gpC);
  a.vpaddd(ymmA, ymmB, ymmC);
  a.vpaddd(ymmA, ymmB, anyptr_gpC);
  a.vpaddq(ymmA, ymmB, ymmC);
  a.vpaddq(ymmA, ymmB, anyptr_gpC);
  a.vpaddw(ymmA, ymmB, ymmC);
  a.vpaddw(ymmA, ymmB, anyptr_gpC);
  a.vpaddsb(ymmA, ymmB, ymmC);
  a.vpaddsb(ymmA, ymmB, anyptr_gpC);
  a.vpaddsw(ymmA, ymmB, ymmC);
  a.vpaddsw(ymmA, ymmB, anyptr_gpC);
  a.vpaddusb(ymmA, ymmB, ymmC);
  a.vpaddusb(ymmA, ymmB, anyptr_gpC);
  a.vpaddusw(ymmA, ymmB, ymmC);
  a.vpaddusw(ymmA, ymmB, anyptr_gpC);
  a.vpalignr(ymmA, ymmB, ymmC, 0);
  a.vpalignr(ymmA, ymmB, anyptr_gpC, 0);
  a.vpand(ymmA, ymmB, ymmC);
  a.vpand(ymmA, ymmB, anyptr_gpC);
  a.vpandn(ymmA, ymmB, ymmC);
  a.vpandn(ymmA, ymmB, anyptr_gpC);
  a.vpavgb(ymmA, ymmB, ymmC);
  a.vpavgb(ymmA, ymmB, anyptr_gpC);
  a.vpavgw(ymmA, ymmB, ymmC);
  a.vpavgw(ymmA, ymmB, anyptr_gpC);
  a.vpblendd(xmmA, xmmB, xmmC, 0);
  a.vpblendd(xmmA, xmmB, anyptr_gpC, 0);
  a.vpblendd(ymmA, ymmB, ymmC, 0);
  a.vpblendd(ymmA, ymmB, anyptr_gpC, 0);
  a.vpblendvb(ymmA, ymmB, ymmC, ymmD);
  a.vpblendvb(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vpblendw(ymmA, ymmB, ymmC, 0);
  a.vpblendw(ymmA, ymmB, anyptr_gpC, 0);
  a.vpbroadcastb(xmmA, xmmB);
  a.vpbroadcastb(xmmA, anyptr_gpB);
  a.vpbroadcastb(ymmA, xmmB);
  a.vpbroadcastb(ymmA, anyptr_gpB);
  a.vpbroadcastd(xmmA, xmmB);
  a.vpbroadcastd(xmmA, anyptr_gpB);
  a.vpbroadcastd(ymmA, xmmB);
  a.vpbroadcastd(ymmA, anyptr_gpB);
  a.vpbroadcastq(xmmA, xmmB);
  a.vpbroadcastq(xmmA, anyptr_gpB);
  a.vpbroadcastq(ymmA, xmmB);
  a.vpbroadcastq(ymmA, anyptr_gpB);
  a.vpbroadcastw(xmmA, xmmB);
  a.vpbroadcastw(xmmA, anyptr_gpB);
  a.vpbroadcastw(ymmA, xmmB);
  a.vpbroadcastw(ymmA, anyptr_gpB);
  a.vpcmpeqb(ymmA, ymmB, ymmC);
  a.vpcmpeqb(ymmA, ymmB, anyptr_gpC);
  a.vpcmpeqd(ymmA, ymmB, ymmC);
  a.vpcmpeqd(ymmA, ymmB, anyptr_gpC);
  a.vpcmpeqq(ymmA, ymmB, ymmC);
  a.vpcmpeqq(ymmA, ymmB, anyptr_gpC);
  a.vpcmpeqw(ymmA, ymmB, ymmC);
  a.vpcmpeqw(ymmA, ymmB, anyptr_gpC);
  a.vpcmpgtb(ymmA, ymmB, ymmC);
  a.vpcmpgtb(ymmA, ymmB, anyptr_gpC);
  a.vpcmpgtd(ymmA, ymmB, ymmC);
  a.vpcmpgtd(ymmA, ymmB, anyptr_gpC);
  a.vpcmpgtq(ymmA, ymmB, ymmC);
  a.vpcmpgtq(ymmA, ymmB, anyptr_gpC);
  a.vpcmpgtw(ymmA, ymmB, ymmC);
  a.vpcmpgtw(ymmA, ymmB, anyptr_gpC);
  a.vperm2i128(ymmA, ymmB, ymmC, 0);
  a.vperm2i128(ymmA, ymmB, anyptr_gpC, 0);
  a.vpermd(ymmA, ymmB, ymmC);
  a.vpermd(ymmA, ymmB, anyptr_gpC);
  a.vpermps(ymmA, ymmB, ymmC);
  a.vpermps(ymmA, ymmB, anyptr_gpC);
  a.vpermpd(ymmA, ymmB, 0);
  a.vpermpd(ymmA, anyptr_gpB, 0);
  a.vpermq(ymmA, ymmB, 0);
  a.vpermq(ymmA, anyptr_gpB, 0);
  a.vpgatherdd(xmmA, vmxptr_gpB, xmmC);
  a.vpgatherdd(ymmA, vmyptr_gpB, ymmC);
  a.vpgatherdq(xmmA, vmxptr_gpB, xmmC);
  a.vpgatherdq(ymmA, vmyptr_gpB, ymmC);
  a.vpgatherqd(xmmA, vmxptr_gpB, xmmC);
  a.vpgatherqd(xmmA, vmyptr_gpB, xmmC);
  a.vpgatherqq(xmmA, vmxptr_gpB, xmmC);
  a.vpgatherqq(ymmA, vmyptr_gpB, ymmC);
  a.vpmovmskb(gzA, ymmB);
  a.vpmovsxbd(ymmA, anyptr_gpB);
  a.vpmovsxbd(ymmA, xmmB);
  a.vpmovsxbq(ymmA, anyptr_gpB);
  a.vpmovsxbq(ymmA, xmmB);
  a.vpmovsxbw(ymmA, anyptr_gpB);
  a.vpmovsxbw(ymmA, xmmB);
  a.vpmovsxdq(ymmA, anyptr_gpB);
  a.vpmovsxdq(ymmA, xmmB);
  a.vpmovsxwd(ymmA, anyptr_gpB);
  a.vpmovsxwd(ymmA, xmmB);
  a.vpmovsxwq(ymmA, anyptr_gpB);
  a.vpmovsxwq(ymmA, xmmB);
  a.vpmovzxbd(ymmA, anyptr_gpB);
  a.vpmovzxbd(ymmA, xmmB);
  a.vpmovzxbq(ymmA, anyptr_gpB);
  a.vpmovzxbq(ymmA, xmmB);
  a.vpmovzxbw(ymmA, anyptr_gpB);
  a.vpmovzxbw(ymmA, xmmB);
  a.vpmovzxdq(ymmA, anyptr_gpB);
  a.vpmovzxdq(ymmA, xmmB);
  a.vpmovzxwd(ymmA, anyptr_gpB);
  a.vpmovzxwd(ymmA, xmmB);
  a.vpmovzxwq(ymmA, anyptr_gpB);
  a.vpmovzxwq(ymmA, xmmB);
  a.vpshufd(ymmA, anyptr_gpB, 0);
  a.vpshufd(ymmA, ymmB, 0);
  a.vpshufhw(ymmA, anyptr_gpB, 0);
  a.vpshufhw(ymmA, ymmB, 0);
  a.vpshuflw(ymmA, anyptr_gpB, 0);
  a.vpshuflw(ymmA, ymmB, 0);
  a.vpslld(ymmA, ymmB, 0);
  a.vpslldq(ymmA, ymmB, 0);
  a.vpsllq(ymmA, ymmB, 0);
  a.vpsllw(ymmA, ymmB, 0);
  a.vpsrad(ymmA, ymmB, 0);
  a.vpsraw(ymmA, ymmB, 0);
  a.vpsrld(ymmA, ymmB, 0);
  a.vpsrldq(ymmA, ymmB, 0);
  a.vpsrlq(ymmA, ymmB, 0);
  a.vpsrlw(ymmA, ymmB, 0);
  a.vphaddd(ymmA, ymmB, anyptr_gpC);
  a.vphaddd(ymmA, ymmB, ymmC);
  a.vphaddsw(ymmA, ymmB, anyptr_gpC);
  a.vphaddsw(ymmA, ymmB, ymmC);
  a.vphaddw(ymmA, ymmB, anyptr_gpC);
  a.vphaddw(ymmA, ymmB, ymmC);
  a.vphsubd(ymmA, ymmB, anyptr_gpC);
  a.vphsubd(ymmA, ymmB, ymmC);
  a.vphsubsw(ymmA, ymmB, anyptr_gpC);
  a.vphsubsw(ymmA, ymmB, ymmC);
  a.vphsubw(ymmA, ymmB, anyptr_gpC);
  a.vphsubw(ymmA, ymmB, ymmC);
  a.vpmaddubsw(ymmA, ymmB, anyptr_gpC);
  a.vpmaddubsw(ymmA, ymmB, ymmC);
  a.vpmaddwd(ymmA, ymmB, anyptr_gpC);
  a.vpmaddwd(ymmA, ymmB, ymmC);
  a.vpmaskmovd(anyptr_gpA, xmmB, xmmC);
  a.vpmaskmovd(anyptr_gpA, ymmB, ymmC);
  a.vpmaskmovd(xmmA, xmmB, anyptr_gpC);
  a.vpmaskmovd(ymmA, ymmB, anyptr_gpC);
  a.vpmaskmovq(anyptr_gpA, xmmB, xmmC);
  a.vpmaskmovq(anyptr_gpA, ymmB, ymmC);
  a.vpmaskmovq(xmmA, xmmB, anyptr_gpC);
  a.vpmaskmovq(ymmA, ymmB, anyptr_gpC);
  a.vpmaxsb(ymmA, ymmB, anyptr_gpC);
  a.vpmaxsb(ymmA, ymmB, ymmC);
  a.vpmaxsd(ymmA, ymmB, anyptr_gpC);
  a.vpmaxsd(ymmA, ymmB, ymmC);
  a.vpmaxsw(ymmA, ymmB, anyptr_gpC);
  a.vpmaxsw(ymmA, ymmB, ymmC);
  a.vpmaxub(ymmA, ymmB, anyptr_gpC);
  a.vpmaxub(ymmA, ymmB, ymmC);
  a.vpmaxud(ymmA, ymmB, anyptr_gpC);
  a.vpmaxud(ymmA, ymmB, ymmC);
  a.vpmaxuw(ymmA, ymmB, anyptr_gpC);
  a.vpmaxuw(ymmA, ymmB, ymmC);
  a.vpminsb(ymmA, ymmB, anyptr_gpC);
  a.vpminsb(ymmA, ymmB, ymmC);
  a.vpminsd(ymmA, ymmB, anyptr_gpC);
  a.vpminsd(ymmA, ymmB, ymmC);
  a.vpminsw(ymmA, ymmB, anyptr_gpC);
  a.vpminsw(ymmA, ymmB, ymmC);
  a.vpminub(ymmA, ymmB, anyptr_gpC);
  a.vpminub(ymmA, ymmB, ymmC);
  a.vpminud(ymmA, ymmB, anyptr_gpC);
  a.vpminud(ymmA, ymmB, ymmC);
  a.vpminuw(ymmA, ymmB, anyptr_gpC);
  a.vpminuw(ymmA, ymmB, ymmC);
  a.vpmuldq(ymmA, ymmB, anyptr_gpC);
  a.vpmuldq(ymmA, ymmB, ymmC);
  a.vpmulhrsw(ymmA, ymmB, anyptr_gpC);
  a.vpmulhrsw(ymmA, ymmB, ymmC);
  a.vpmulhuw(ymmA, ymmB, anyptr_gpC);
  a.vpmulhuw(ymmA, ymmB, ymmC);
  a.vpmulhw(ymmA, ymmB, anyptr_gpC);
  a.vpmulhw(ymmA, ymmB, ymmC);
  a.vpmulld(ymmA, ymmB, anyptr_gpC);
  a.vpmulld(ymmA, ymmB, ymmC);
  a.vpmullw(ymmA, ymmB, anyptr_gpC);
  a.vpmullw(ymmA, ymmB, ymmC);
  a.vpmuludq(ymmA, ymmB, anyptr_gpC);
  a.vpmuludq(ymmA, ymmB, ymmC);
  a.vpor(ymmA, ymmB, anyptr_gpC);
  a.vpor(ymmA, ymmB, ymmC);
  a.vpsadbw(ymmA, ymmB, anyptr_gpC);
  a.vpsadbw(ymmA, ymmB, ymmC);
  a.vpshufb(ymmA, ymmB, anyptr_gpC);
  a.vpshufb(ymmA, ymmB, ymmC);
  a.vpsignb(ymmA, ymmB, anyptr_gpC);
  a.vpsignb(ymmA, ymmB, ymmC);
  a.vpsignd(ymmA, ymmB, anyptr_gpC);
  a.vpsignd(ymmA, ymmB, ymmC);
  a.vpsignw(ymmA, ymmB, anyptr_gpC);
  a.vpsignw(ymmA, ymmB, ymmC);
  a.vpslld(ymmA, ymmB, anyptr_gpC);
  a.vpslld(ymmA, ymmB, xmmC);
  a.vpsllq(ymmA, ymmB, anyptr_gpC);
  a.vpsllq(ymmA, ymmB, xmmC);
  a.vpsllvd(xmmA, xmmB, anyptr_gpC);
  a.vpsllvd(xmmA, xmmB, xmmC);
  a.vpsllvd(ymmA, ymmB, anyptr_gpC);
  a.vpsllvd(ymmA, ymmB, ymmC);
  a.vpsllvq(xmmA, xmmB, anyptr_gpC);
  a.vpsllvq(xmmA, xmmB, xmmC);
  a.vpsllvq(ymmA, ymmB, anyptr_gpC);
  a.vpsllvq(ymmA, ymmB, ymmC);
  a.vpsllw(ymmA, ymmB, anyptr_gpC);
  a.vpsllw(ymmA, ymmB, xmmC);
  a.vpsrad(ymmA, ymmB, anyptr_gpC);
  a.vpsrad(ymmA, ymmB, xmmC);
  a.vpsravd(xmmA, xmmB, anyptr_gpC);
  a.vpsravd(xmmA, xmmB, xmmC);
  a.vpsravd(ymmA, ymmB, anyptr_gpC);
  a.vpsravd(ymmA, ymmB, ymmC);
  a.vpsraw(ymmA, ymmB, anyptr_gpC);
  a.vpsraw(ymmA, ymmB, xmmC);
  a.vpsrld(ymmA, ymmB, anyptr_gpC);
  a.vpsrld(ymmA, ymmB, xmmC);
  a.vpsrlq(ymmA, ymmB, anyptr_gpC);
  a.vpsrlq(ymmA, ymmB, xmmC);
  a.vpsrlvd(xmmA, xmmB, anyptr_gpC);
  a.vpsrlvd(xmmA, xmmB, xmmC);
  a.vpsrlvd(ymmA, ymmB, anyptr_gpC);
  a.vpsrlvd(ymmA, ymmB, ymmC);
  a.vpsrlvq(xmmA, xmmB, anyptr_gpC);
  a.vpsrlvq(xmmA, xmmB, xmmC);
  a.vpsrlvq(ymmA, ymmB, anyptr_gpC);
  a.vpsrlvq(ymmA, ymmB, ymmC);
  a.vpsrlw(ymmA, ymmB, anyptr_gpC);
  a.vpsrlw(ymmA, ymmB, xmmC);
  a.vpsubb(ymmA, ymmB, anyptr_gpC);
  a.vpsubb(ymmA, ymmB, ymmC);
  a.vpsubd(ymmA, ymmB, anyptr_gpC);
  a.vpsubd(ymmA, ymmB, ymmC);
  a.vpsubq(ymmA, ymmB, anyptr_gpC);
  a.vpsubq(ymmA, ymmB, ymmC);
  a.vpsubsb(ymmA, ymmB, anyptr_gpC);
  a.vpsubsb(ymmA, ymmB, ymmC);
  a.vpsubsw(ymmA, ymmB, anyptr_gpC);
  a.vpsubsw(ymmA, ymmB, ymmC);
  a.vpsubusb(ymmA, ymmB, anyptr_gpC);
  a.vpsubusb(ymmA, ymmB, ymmC);
  a.vpsubusw(ymmA, ymmB, anyptr_gpC);
  a.vpsubusw(ymmA, ymmB, ymmC);
  a.vpsubw(ymmA, ymmB, anyptr_gpC);
  a.vpsubw(ymmA, ymmB, ymmC);
  a.vpunpckhbw(ymmA, ymmB, anyptr_gpC);
  a.vpunpckhbw(ymmA, ymmB, ymmC);
  a.vpunpckhdq(ymmA, ymmB, anyptr_gpC);
  a.vpunpckhdq(ymmA, ymmB, ymmC);
  a.vpunpckhqdq(ymmA, ymmB, anyptr_gpC);
  a.vpunpckhqdq(ymmA, ymmB, ymmC);
  a.vpunpckhwd(ymmA, ymmB, anyptr_gpC);
  a.vpunpckhwd(ymmA, ymmB, ymmC);
  a.vpunpcklbw(ymmA, ymmB, anyptr_gpC);
  a.vpunpcklbw(ymmA, ymmB, ymmC);
  a.vpunpckldq(ymmA, ymmB, anyptr_gpC);
  a.vpunpckldq(ymmA, ymmB, ymmC);
  a.vpunpcklqdq(ymmA, ymmB, anyptr_gpC);
  a.vpunpcklqdq(ymmA, ymmB, ymmC);
  a.vpunpcklwd(ymmA, ymmB, anyptr_gpC);
  a.vpunpcklwd(ymmA, ymmB, ymmC);
  a.vpxor(ymmA, ymmB, anyptr_gpC);
  a.vpxor(ymmA, ymmB, ymmC);

  // FMA3.
  a.nop();

  a.vfmadd132pd(xmmA, xmmB, anyptr_gpC);
  a.vfmadd132pd(xmmA, xmmB, xmmC);
  a.vfmadd132pd(ymmA, ymmB, anyptr_gpC);
  a.vfmadd132pd(ymmA, ymmB, ymmC);
  a.vfmadd132ps(xmmA, xmmB, anyptr_gpC);
  a.vfmadd132ps(xmmA, xmmB, xmmC);
  a.vfmadd132ps(ymmA, ymmB, anyptr_gpC);
  a.vfmadd132ps(ymmA, ymmB, ymmC);
  a.vfmadd132sd(xmmA, xmmB, anyptr_gpC);
  a.vfmadd132sd(xmmA, xmmB, xmmC);
  a.vfmadd132ss(xmmA, xmmB, anyptr_gpC);
  a.vfmadd132ss(xmmA, xmmB, xmmC);
  a.vfmadd213pd(xmmA, xmmB, anyptr_gpC);
  a.vfmadd213pd(xmmA, xmmB, xmmC);
  a.vfmadd213pd(ymmA, ymmB, anyptr_gpC);
  a.vfmadd213pd(ymmA, ymmB, ymmC);
  a.vfmadd213ps(xmmA, xmmB, anyptr_gpC);
  a.vfmadd213ps(xmmA, xmmB, xmmC);
  a.vfmadd213ps(ymmA, ymmB, anyptr_gpC);
  a.vfmadd213ps(ymmA, ymmB, ymmC);
  a.vfmadd213sd(xmmA, xmmB, anyptr_gpC);
  a.vfmadd213sd(xmmA, xmmB, xmmC);
  a.vfmadd213ss(xmmA, xmmB, anyptr_gpC);
  a.vfmadd213ss(xmmA, xmmB, xmmC);
  a.vfmadd231pd(xmmA, xmmB, anyptr_gpC);
  a.vfmadd231pd(xmmA, xmmB, xmmC);
  a.vfmadd231pd(ymmA, ymmB, anyptr_gpC);
  a.vfmadd231pd(ymmA, ymmB, ymmC);
  a.vfmadd231ps(xmmA, xmmB, anyptr_gpC);
  a.vfmadd231ps(xmmA, xmmB, xmmC);
  a.vfmadd231ps(ymmA, ymmB, anyptr_gpC);
  a.vfmadd231ps(ymmA, ymmB, ymmC);
  a.vfmadd231sd(xmmA, xmmB, anyptr_gpC);
  a.vfmadd231sd(xmmA, xmmB, xmmC);
  a.vfmadd231ss(xmmA, xmmB, anyptr_gpC);
  a.vfmadd231ss(xmmA, xmmB, xmmC);
  a.vfmaddsub132pd(xmmA, xmmB, anyptr_gpC);
  a.vfmaddsub132pd(xmmA, xmmB, xmmC);
  a.vfmaddsub132pd(ymmA, ymmB, anyptr_gpC);
  a.vfmaddsub132pd(ymmA, ymmB, ymmC);
  a.vfmaddsub132ps(xmmA, xmmB, anyptr_gpC);
  a.vfmaddsub132ps(xmmA, xmmB, xmmC);
  a.vfmaddsub132ps(ymmA, ymmB, anyptr_gpC);
  a.vfmaddsub132ps(ymmA, ymmB, ymmC);
  a.vfmaddsub213pd(xmmA, xmmB, anyptr_gpC);
  a.vfmaddsub213pd(xmmA, xmmB, xmmC);
  a.vfmaddsub213pd(ymmA, ymmB, anyptr_gpC);
  a.vfmaddsub213pd(ymmA, ymmB, ymmC);
  a.vfmaddsub213ps(xmmA, xmmB, anyptr_gpC);
  a.vfmaddsub213ps(xmmA, xmmB, xmmC);
  a.vfmaddsub213ps(ymmA, ymmB, anyptr_gpC);
  a.vfmaddsub213ps(ymmA, ymmB, ymmC);
  a.vfmaddsub231pd(xmmA, xmmB, anyptr_gpC);
  a.vfmaddsub231pd(xmmA, xmmB, xmmC);
  a.vfmaddsub231pd(ymmA, ymmB, anyptr_gpC);
  a.vfmaddsub231pd(ymmA, ymmB, ymmC);
  a.vfmaddsub231ps(xmmA, xmmB, anyptr_gpC);
  a.vfmaddsub231ps(xmmA, xmmB, xmmC);
  a.vfmaddsub231ps(ymmA, ymmB, anyptr_gpC);
  a.vfmaddsub231ps(ymmA, ymmB, ymmC);
  a.vfmsub132pd(xmmA, xmmB, anyptr_gpC);
  a.vfmsub132pd(xmmA, xmmB, xmmC);
  a.vfmsub132pd(ymmA, ymmB, anyptr_gpC);
  a.vfmsub132pd(ymmA, ymmB, ymmC);
  a.vfmsub132ps(xmmA, xmmB, anyptr_gpC);
  a.vfmsub132ps(xmmA, xmmB, xmmC);
  a.vfmsub132ps(ymmA, ymmB, anyptr_gpC);
  a.vfmsub132ps(ymmA, ymmB, ymmC);
  a.vfmsub132sd(xmmA, xmmB, anyptr_gpC);
  a.vfmsub132sd(xmmA, xmmB, xmmC);
  a.vfmsub132ss(xmmA, xmmB, anyptr_gpC);
  a.vfmsub132ss(xmmA, xmmB, xmmC);
  a.vfmsub213pd(xmmA, xmmB, anyptr_gpC);
  a.vfmsub213pd(xmmA, xmmB, xmmC);
  a.vfmsub213pd(ymmA, ymmB, anyptr_gpC);
  a.vfmsub213pd(ymmA, ymmB, ymmC);
  a.vfmsub213ps(xmmA, xmmB, anyptr_gpC);
  a.vfmsub213ps(xmmA, xmmB, xmmC);
  a.vfmsub213ps(ymmA, ymmB, anyptr_gpC);
  a.vfmsub213ps(ymmA, ymmB, ymmC);
  a.vfmsub213sd(xmmA, xmmB, anyptr_gpC);
  a.vfmsub213sd(xmmA, xmmB, xmmC);
  a.vfmsub213ss(xmmA, xmmB, anyptr_gpC);
  a.vfmsub213ss(xmmA, xmmB, xmmC);
  a.vfmsub231pd(xmmA, xmmB, anyptr_gpC);
  a.vfmsub231pd(xmmA, xmmB, xmmC);
  a.vfmsub231pd(ymmA, ymmB, anyptr_gpC);
  a.vfmsub231pd(ymmA, ymmB, ymmC);
  a.vfmsub231ps(xmmA, xmmB, anyptr_gpC);
  a.vfmsub231ps(xmmA, xmmB, xmmC);
  a.vfmsub231ps(ymmA, ymmB, anyptr_gpC);
  a.vfmsub231ps(ymmA, ymmB, ymmC);
  a.vfmsub231sd(xmmA, xmmB, anyptr_gpC);
  a.vfmsub231sd(xmmA, xmmB, xmmC);
  a.vfmsub231ss(xmmA, xmmB, anyptr_gpC);
  a.vfmsub231ss(xmmA, xmmB, xmmC);
  a.vfmsubadd132pd(xmmA, xmmB, anyptr_gpC);
  a.vfmsubadd132pd(xmmA, xmmB, xmmC);
  a.vfmsubadd132pd(ymmA, ymmB, anyptr_gpC);
  a.vfmsubadd132pd(ymmA, ymmB, ymmC);
  a.vfmsubadd132ps(xmmA, xmmB, anyptr_gpC);
  a.vfmsubadd132ps(xmmA, xmmB, xmmC);
  a.vfmsubadd132ps(ymmA, ymmB, anyptr_gpC);
  a.vfmsubadd132ps(ymmA, ymmB, ymmC);
  a.vfmsubadd213pd(xmmA, xmmB, anyptr_gpC);
  a.vfmsubadd213pd(xmmA, xmmB, xmmC);
  a.vfmsubadd213pd(ymmA, ymmB, anyptr_gpC);
  a.vfmsubadd213pd(ymmA, ymmB, ymmC);
  a.vfmsubadd213ps(xmmA, xmmB, anyptr_gpC);
  a.vfmsubadd213ps(xmmA, xmmB, xmmC);
  a.vfmsubadd213ps(ymmA, ymmB, anyptr_gpC);
  a.vfmsubadd213ps(ymmA, ymmB, ymmC);
  a.vfmsubadd231pd(xmmA, xmmB, anyptr_gpC);
  a.vfmsubadd231pd(xmmA, xmmB, xmmC);
  a.vfmsubadd231pd(ymmA, ymmB, anyptr_gpC);
  a.vfmsubadd231pd(ymmA, ymmB, ymmC);
  a.vfmsubadd231ps(xmmA, xmmB, anyptr_gpC);
  a.vfmsubadd231ps(xmmA, xmmB, xmmC);
  a.vfmsubadd231ps(ymmA, ymmB, anyptr_gpC);
  a.vfmsubadd231ps(ymmA, ymmB, ymmC);
  a.vfnmadd132pd(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd132pd(xmmA, xmmB, xmmC);
  a.vfnmadd132pd(ymmA, ymmB, anyptr_gpC);
  a.vfnmadd132pd(ymmA, ymmB, ymmC);
  a.vfnmadd132ps(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd132ps(xmmA, xmmB, xmmC);
  a.vfnmadd132ps(ymmA, ymmB, anyptr_gpC);
  a.vfnmadd132ps(ymmA, ymmB, ymmC);
  a.vfnmadd132sd(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd132sd(xmmA, xmmB, xmmC);
  a.vfnmadd132ss(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd132ss(xmmA, xmmB, xmmC);
  a.vfnmadd213pd(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd213pd(xmmA, xmmB, xmmC);
  a.vfnmadd213pd(ymmA, ymmB, anyptr_gpC);
  a.vfnmadd213pd(ymmA, ymmB, ymmC);
  a.vfnmadd213ps(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd213ps(xmmA, xmmB, xmmC);
  a.vfnmadd213ps(ymmA, ymmB, anyptr_gpC);
  a.vfnmadd213ps(ymmA, ymmB, ymmC);
  a.vfnmadd213sd(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd213sd(xmmA, xmmB, xmmC);
  a.vfnmadd213ss(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd213ss(xmmA, xmmB, xmmC);
  a.vfnmadd231pd(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd231pd(xmmA, xmmB, xmmC);
  a.vfnmadd231pd(ymmA, ymmB, anyptr_gpC);
  a.vfnmadd231pd(ymmA, ymmB, ymmC);
  a.vfnmadd231ps(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd231ps(xmmA, xmmB, xmmC);
  a.vfnmadd231ps(ymmA, ymmB, anyptr_gpC);
  a.vfnmadd231ps(ymmA, ymmB, ymmC);
  a.vfnmadd231sd(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd231sd(xmmA, xmmB, xmmC);
  a.vfnmadd231ss(xmmA, xmmB, anyptr_gpC);
  a.vfnmadd231ss(xmmA, xmmB, xmmC);
  a.vfnmsub132pd(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub132pd(xmmA, xmmB, xmmC);
  a.vfnmsub132pd(ymmA, ymmB, anyptr_gpC);
  a.vfnmsub132pd(ymmA, ymmB, ymmC);
  a.vfnmsub132ps(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub132ps(xmmA, xmmB, xmmC);
  a.vfnmsub132ps(ymmA, ymmB, anyptr_gpC);
  a.vfnmsub132ps(ymmA, ymmB, ymmC);
  a.vfnmsub132sd(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub132sd(xmmA, xmmB, xmmC);
  a.vfnmsub132ss(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub132ss(xmmA, xmmB, xmmC);
  a.vfnmsub213pd(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub213pd(xmmA, xmmB, xmmC);
  a.vfnmsub213pd(ymmA, ymmB, anyptr_gpC);
  a.vfnmsub213pd(ymmA, ymmB, ymmC);
  a.vfnmsub213ps(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub213ps(xmmA, xmmB, xmmC);
  a.vfnmsub213ps(ymmA, ymmB, anyptr_gpC);
  a.vfnmsub213ps(ymmA, ymmB, ymmC);
  a.vfnmsub213sd(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub213sd(xmmA, xmmB, xmmC);
  a.vfnmsub213ss(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub213ss(xmmA, xmmB, xmmC);
  a.vfnmsub231pd(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub231pd(xmmA, xmmB, xmmC);
  a.vfnmsub231pd(ymmA, ymmB, anyptr_gpC);
  a.vfnmsub231pd(ymmA, ymmB, ymmC);
  a.vfnmsub231ps(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub231ps(xmmA, xmmB, xmmC);
  a.vfnmsub231ps(ymmA, ymmB, anyptr_gpC);
  a.vfnmsub231ps(ymmA, ymmB, ymmC);
  a.vfnmsub231sd(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub231sd(xmmA, xmmB, xmmC);
  a.vfnmsub231ss(xmmA, xmmB, anyptr_gpC);
  a.vfnmsub231ss(xmmA, xmmB, xmmC);

  // FMA4.
  a.nop();

  a.vfmaddpd(xmmA, xmmB, xmmC, xmmD);
  a.vfmaddpd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmaddpd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmaddpd(ymmA, ymmB, ymmC, ymmD);
  a.vfmaddpd(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfmaddpd(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfmaddps(xmmA, xmmB, xmmC, xmmD);
  a.vfmaddps(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmaddps(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmaddps(ymmA, ymmB, ymmC, ymmD);
  a.vfmaddps(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfmaddps(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfmaddsd(xmmA, xmmB, xmmC, xmmD);
  a.vfmaddsd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmaddsd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmaddss(xmmA, xmmB, xmmC, xmmD);
  a.vfmaddss(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmaddss(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmaddsubpd(xmmA, xmmB, xmmC, xmmD);
  a.vfmaddsubpd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmaddsubpd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmaddsubpd(ymmA, ymmB, ymmC, ymmD);
  a.vfmaddsubpd(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfmaddsubpd(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfmaddsubps(xmmA, xmmB, xmmC, xmmD);
  a.vfmaddsubps(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmaddsubps(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmaddsubps(ymmA, ymmB, ymmC, ymmD);
  a.vfmaddsubps(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfmaddsubps(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfmsubaddpd(xmmA, xmmB, xmmC, xmmD);
  a.vfmsubaddpd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmsubaddpd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmsubaddpd(ymmA, ymmB, ymmC, ymmD);
  a.vfmsubaddpd(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfmsubaddpd(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfmsubaddps(xmmA, xmmB, xmmC, xmmD);
  a.vfmsubaddps(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmsubaddps(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmsubaddps(ymmA, ymmB, ymmC, ymmD);
  a.vfmsubaddps(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfmsubaddps(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfmsubpd(xmmA, xmmB, xmmC, xmmD);
  a.vfmsubpd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmsubpd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmsubpd(ymmA, ymmB, ymmC, ymmD);
  a.vfmsubpd(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfmsubpd(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfmsubps(xmmA, xmmB, xmmC, xmmD);
  a.vfmsubps(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmsubps(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmsubps(ymmA, ymmB, ymmC, ymmD);
  a.vfmsubps(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfmsubps(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfmsubsd(xmmA, xmmB, xmmC, xmmD);
  a.vfmsubsd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmsubsd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfmsubss(xmmA, xmmB, xmmC, xmmD);
  a.vfmsubss(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfmsubss(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfnmaddpd(xmmA, xmmB, xmmC, xmmD);
  a.vfnmaddpd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfnmaddpd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfnmaddpd(ymmA, ymmB, ymmC, ymmD);
  a.vfnmaddpd(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfnmaddpd(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfnmaddps(xmmA, xmmB, xmmC, xmmD);
  a.vfnmaddps(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfnmaddps(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfnmaddps(ymmA, ymmB, ymmC, ymmD);
  a.vfnmaddps(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfnmaddps(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfnmaddsd(xmmA, xmmB, xmmC, xmmD);
  a.vfnmaddsd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfnmaddsd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfnmaddss(xmmA, xmmB, xmmC, xmmD);
  a.vfnmaddss(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfnmaddss(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfnmsubpd(xmmA, xmmB, xmmC, xmmD);
  a.vfnmsubpd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfnmsubpd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfnmsubpd(ymmA, ymmB, ymmC, ymmD);
  a.vfnmsubpd(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfnmsubpd(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfnmsubps(xmmA, xmmB, xmmC, xmmD);
  a.vfnmsubps(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfnmsubps(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfnmsubps(ymmA, ymmB, ymmC, ymmD);
  a.vfnmsubps(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vfnmsubps(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vfnmsubsd(xmmA, xmmB, xmmC, xmmD);
  a.vfnmsubsd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfnmsubsd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vfnmsubss(xmmA, xmmB, xmmC, xmmD);
  a.vfnmsubss(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vfnmsubss(xmmA, xmmB, xmmC, anyptr_gpD);

  // XOP.
  a.nop();

  a.vfrczpd(xmmA, xmmB);
  a.vfrczpd(xmmA, anyptr_gpB);
  a.vfrczpd(ymmA, ymmB);
  a.vfrczpd(ymmA, anyptr_gpB);
  a.vfrczps(xmmA, xmmB);
  a.vfrczps(xmmA, anyptr_gpB);
  a.vfrczps(ymmA, ymmB);
  a.vfrczps(ymmA, anyptr_gpB);
  a.vfrczsd(xmmA, xmmB);
  a.vfrczsd(xmmA, anyptr_gpB);
  a.vfrczss(xmmA, xmmB);
  a.vfrczss(xmmA, anyptr_gpB);
  a.vpcmov(xmmA, xmmB, xmmC, xmmD);
  a.vpcmov(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpcmov(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vpcmov(ymmA, ymmB, ymmC, ymmD);
  a.vpcmov(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vpcmov(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vpcomb(xmmA, xmmB, xmmC, 0);
  a.vpcomb(xmmA, xmmB, anyptr_gpC, 0);
  a.vpcomd(xmmA, xmmB, xmmC, 0);
  a.vpcomd(xmmA, xmmB, anyptr_gpC, 0);
  a.vpcomq(xmmA, xmmB, xmmC, 0);
  a.vpcomq(xmmA, xmmB, anyptr_gpC, 0);
  a.vpcomw(xmmA, xmmB, xmmC, 0);
  a.vpcomw(xmmA, xmmB, anyptr_gpC, 0);
  a.vpcomub(xmmA, xmmB, xmmC, 0);
  a.vpcomub(xmmA, xmmB, anyptr_gpC, 0);
  a.vpcomud(xmmA, xmmB, xmmC, 0);
  a.vpcomud(xmmA, xmmB, anyptr_gpC, 0);
  a.vpcomuq(xmmA, xmmB, xmmC, 0);
  a.vpcomuq(xmmA, xmmB, anyptr_gpC, 0);
  a.vpcomuw(xmmA, xmmB, xmmC, 0);
  a.vpcomuw(xmmA, xmmB, anyptr_gpC, 0);
  a.vpermil2pd(xmmA, xmmB, xmmC, xmmD);
  a.vpermil2pd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpermil2pd(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vpermil2pd(ymmA, ymmB, ymmC, ymmD);
  a.vpermil2pd(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vpermil2pd(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vpermil2ps(xmmA, xmmB, xmmC, xmmD);
  a.vpermil2ps(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpermil2ps(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vpermil2ps(ymmA, ymmB, ymmC, ymmD);
  a.vpermil2ps(ymmA, ymmB, anyptr_gpC, ymmD);
  a.vpermil2ps(ymmA, ymmB, ymmC, anyptr_gpD);
  a.vphaddbd(xmmA, xmmB);
  a.vphaddbd(xmmA, anyptr_gpB);
  a.vphaddbq(xmmA, xmmB);
  a.vphaddbq(xmmA, anyptr_gpB);
  a.vphaddbw(xmmA, xmmB);
  a.vphaddbw(xmmA, anyptr_gpB);
  a.vphadddq(xmmA, xmmB);
  a.vphadddq(xmmA, anyptr_gpB);
  a.vphaddwd(xmmA, xmmB);
  a.vphaddwd(xmmA, anyptr_gpB);
  a.vphaddwq(xmmA, xmmB);
  a.vphaddwq(xmmA, anyptr_gpB);
  a.vphaddubd(xmmA, xmmB);
  a.vphaddubd(xmmA, anyptr_gpB);
  a.vphaddubq(xmmA, xmmB);
  a.vphaddubq(xmmA, anyptr_gpB);
  a.vphaddubw(xmmA, xmmB);
  a.vphaddubw(xmmA, anyptr_gpB);
  a.vphaddudq(xmmA, xmmB);
  a.vphaddudq(xmmA, anyptr_gpB);
  a.vphadduwd(xmmA, xmmB);
  a.vphadduwd(xmmA, anyptr_gpB);
  a.vphadduwq(xmmA, xmmB);
  a.vphadduwq(xmmA, anyptr_gpB);
  a.vphsubbw(xmmA, xmmB);
  a.vphsubbw(xmmA, anyptr_gpB);
  a.vphsubdq(xmmA, xmmB);
  a.vphsubdq(xmmA, anyptr_gpB);
  a.vphsubwd(xmmA, xmmB);
  a.vphsubwd(xmmA, anyptr_gpB);
  a.vpmacsdd(xmmA, xmmB, xmmC, xmmD);
  a.vpmacsdd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacsdqh(xmmA, xmmB, xmmC, xmmD);
  a.vpmacsdqh(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacsdql(xmmA, xmmB, xmmC, xmmD);
  a.vpmacsdql(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacswd(xmmA, xmmB, xmmC, xmmD);
  a.vpmacswd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacsww(xmmA, xmmB, xmmC, xmmD);
  a.vpmacsww(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacssdd(xmmA, xmmB, xmmC, xmmD);
  a.vpmacssdd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacssdqh(xmmA, xmmB, xmmC, xmmD);
  a.vpmacssdqh(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacssdql(xmmA, xmmB, xmmC, xmmD);
  a.vpmacssdql(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacsswd(xmmA, xmmB, xmmC, xmmD);
  a.vpmacsswd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmacssww(xmmA, xmmB, xmmC, xmmD);
  a.vpmacssww(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmadcsswd(xmmA, xmmB, xmmC, xmmD);
  a.vpmadcsswd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpmadcswd(xmmA, xmmB, xmmC, xmmD);
  a.vpmadcswd(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpperm(xmmA, xmmB, xmmC, xmmD);
  a.vpperm(xmmA, xmmB, anyptr_gpC, xmmD);
  a.vpperm(xmmA, xmmB, xmmC, anyptr_gpD);
  a.vprotb(xmmA, xmmB, xmmC);
  a.vprotb(xmmA, anyptr_gpB, xmmC);
  a.vprotb(xmmA, xmmB, anyptr_gpC);
  a.vprotb(xmmA, xmmB, 0);
  a.vprotb(xmmA, anyptr_gpB, 0);
  a.vprotd(xmmA, xmmB, xmmC);
  a.vprotd(xmmA, anyptr_gpB, xmmC);
  a.vprotd(xmmA, xmmB, anyptr_gpC);
  a.vprotd(xmmA, xmmB, 0);
  a.vprotd(xmmA, anyptr_gpB, 0);
  a.vprotq(xmmA, xmmB, xmmC);
  a.vprotq(xmmA, anyptr_gpB, xmmC);
  a.vprotq(xmmA, xmmB, anyptr_gpC);
  a.vprotq(xmmA, xmmB, 0);
  a.vprotq(xmmA, anyptr_gpB, 0);
  a.vprotw(xmmA, xmmB, xmmC);
  a.vprotw(xmmA, anyptr_gpB, xmmC);
  a.vprotw(xmmA, xmmB, anyptr_gpC);
  a.vprotw(xmmA, xmmB, 0);
  a.vprotw(xmmA, anyptr_gpB, 0);
  a.vpshab(xmmA, xmmB, xmmC);
  a.vpshab(xmmA, anyptr_gpB, xmmC);
  a.vpshab(xmmA, xmmB, anyptr_gpC);
  a.vpshad(xmmA, xmmB, xmmC);
  a.vpshad(xmmA, anyptr_gpB, xmmC);
  a.vpshad(xmmA, xmmB, anyptr_gpC);
  a.vpshaq(xmmA, xmmB, xmmC);
  a.vpshaq(xmmA, anyptr_gpB, xmmC);
  a.vpshaq(xmmA, xmmB, anyptr_gpC);
  a.vpshaw(xmmA, xmmB, xmmC);
  a.vpshaw(xmmA, anyptr_gpB, xmmC);
  a.vpshaw(xmmA, xmmB, anyptr_gpC);
  a.vpshlb(xmmA, xmmB, xmmC);
  a.vpshlb(xmmA, anyptr_gpB, xmmC);
  a.vpshlb(xmmA, xmmB, anyptr_gpC);
  a.vpshld(xmmA, xmmB, xmmC);
  a.vpshld(xmmA, anyptr_gpB, xmmC);
  a.vpshld(xmmA, xmmB, anyptr_gpC);
  a.vpshlq(xmmA, xmmB, xmmC);
  a.vpshlq(xmmA, anyptr_gpB, xmmC);
  a.vpshlq(xmmA, xmmB, anyptr_gpC);
  a.vpshlw(xmmA, xmmB, xmmC);
  a.vpshlw(xmmA, anyptr_gpB, xmmC);
  a.vpshlw(xmmA, xmmB, anyptr_gpC);

  // BMI.
  a.nop();

  a.andn(gzA, gzB, gzC);
  a.andn(gzA, gzB, anyptr_gpC);
  a.bextr(gzA, gzB, gzC);
  a.bextr(gzA, anyptr_gpB, gzC);
  a.blsi(gzA, gzB);
  a.blsi(gzA, anyptr_gpB);
  a.blsmsk(gzA, gzB);
  a.blsmsk(gzA, anyptr_gpB);
  a.blsr(gzA, gzB);
  a.blsr(gzA, anyptr_gpB);

  // LZCNT.
  a.nop();

  a.lzcnt(gzA, gzB);
  a.lzcnt(gzA, anyptr_gpB);

  // TZCNT.
  a.nop();

  a.tzcnt(gzA, gzB);
  a.tzcnt(gzA, anyptr_gpB);

  // BMI2.
  a.nop();

  a.bzhi(gzA, gzB, gzC);
  a.bzhi(gzA, anyptr_gpB, gzC);
  a.mulx(gzA, gzB, gzC);
  a.mulx(gzA, gzB, anyptr_gpC);
  a.pdep(gzA, gzB, gzC);
  a.pdep(gzA, gzB, anyptr_gpC);
  a.pext(gzA, gzB, gzC);
  a.pext(gzA, gzB, anyptr_gpC);
  a.rorx(gzA, gzB, 0);
  a.rorx(gzA, anyptr_gpB, 0);
  a.sarx(gzA, gzB, gzC);
  a.sarx(gzA, anyptr_gpB, gzC);
  a.shlx(gzA, gzB, gzC);
  a.shlx(gzA, anyptr_gpB, gzC);
  a.shrx(gzA, gzB, gzC);
  a.shrx(gzA, anyptr_gpB, gzC);

  // RDRAND.
  a.nop();

  a.rdrand(gzA);

  // F16C.
  a.nop();

  a.vcvtph2ps(xmmA, xmmB);
  a.vcvtph2ps(xmmA, anyptr_gpB);
  a.vcvtph2ps(ymmA, xmmB);
  a.vcvtph2ps(ymmA, anyptr_gpB);
  a.vcvtps2ph(xmmA, xmmB, 0);
  a.vcvtps2ph(anyptr_gpA, xmmB, 0);
  a.vcvtps2ph(xmmA, ymmB, 0);
  a.vcvtps2ph(anyptr_gpA, ymmB, 0);

  // Mark the end of the stream.
  a.nop();
}

} // asmgen namespace

// [Guard]
#endif // _TEST_ASMJIT_TEST_OPCODE_H
